mirror of
https://github.com/status-im/status-go.git
synced 2025-01-23 05:00:35 +00:00
67988 lines
1.8 MiB
67988 lines
1.8 MiB
// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT.
|
|
|
|
//go:build !appengine && !noasm && !nogen && nopshufb && gc
|
|
|
|
#include "textflag.h"
|
|
|
|
// _dummy_ is an empty routine that only returns. Its body is used as the place
// where the XOR3WAY helper macro is defined for the rest of this file.
//
// XOR3WAY(ignore, a, b, dst) computes dst ^= a ^ b:
//   - without GOAMD64_v4: two VPXOR instructions;
//   - with GOAMD64_v4:    one VPTERNLOGD with truth table 0x96 (3-input XOR).
// The first macro argument is unused by both expansions.
//
// func _dummy_()
TEXT ·_dummy_(SB), $0
#ifndef GOAMD64_v4
#define XOR3WAY(ignore, a, b, dst) \
	VPXOR a, dst, dst \
	VPXOR b, dst, dst

#else
#define XOR3WAY(ignore, a, b, dst) \
	VPTERNLOGD $0x96, a, b, dst

#endif
	RET
|
|
|
|
// sSE2XorSlice XORs in into out in place (out[i] ^= in[i]), 16 bytes per
// iteration. The iteration count is len(in)>>4, so a trailing remainder of
// fewer than 16 bytes is left untouched; len(out) is not consulted.
//
// func sSE2XorSlice(in []byte, out []byte)
// Requires: SSE2
TEXT ·sSE2XorSlice(SB), $0-48
	MOVQ in_len+8(FP), DX    // DX = len(in)
	MOVQ out_base+24(FP), CX // CX = destination cursor
	MOVQ in_base+0(FP), AX   // AX = source cursor
	SHRQ $4, DX              // DX = number of 16-byte blocks; ZF set when zero
	JZ   xor16_done

xor16_block:
	MOVOU (CX), X0 // current output bytes
	MOVOU (AX), X1 // input bytes
	PXOR  X1, X0
	MOVOU X0, (CX)
	ADDQ  $16, CX
	ADDQ  $16, AX
	DECQ  DX
	JNZ   xor16_block

xor16_done:
	RET
|
|
|
|
// sSE2XorSlice_64 XORs in into out in place (out[i] ^= in[i]), 64 bytes per
// iteration using four 16-byte SSE2 lanes. The iteration count is len(in)>>6,
// so a trailing remainder of fewer than 64 bytes is left untouched; len(out)
// is not consulted.
//
// func sSE2XorSlice_64(in []byte, out []byte)
// Requires: SSE2
TEXT ·sSE2XorSlice_64(SB), $0-48
	MOVQ in_len+8(FP), DX    // DX = len(in)
	MOVQ out_base+24(FP), CX // CX = destination cursor
	MOVQ in_base+0(FP), AX   // AX = source cursor
	SHRQ $6, DX              // DX = number of 64-byte blocks; ZF set when zero
	JZ   xor64_done

xor64_block:
	// All eight loads are issued before any store.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU 32(CX), X2
	MOVOU 48(CX), X3
	MOVOU (AX), X4
	MOVOU 16(AX), X5
	MOVOU 32(AX), X6
	MOVOU 48(AX), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, 32(CX)
	MOVOU X3, 48(CX)
	ADDQ  $64, CX
	ADDQ  $64, AX
	DECQ  DX
	JNZ   xor64_block

xor64_done:
	RET
|
|
|
|
// avx2XorSlice_64 XORs in into out in place (out[i] ^= in[i]), 64 bytes per
// iteration using two 32-byte YMM lanes. The iteration count is len(in)>>6,
// so a trailing remainder of fewer than 64 bytes is left untouched; len(out)
// is not consulted. VZEROUPPER is executed on every exit path, including the
// one where no block was processed.
//
// func avx2XorSlice_64(in []byte, out []byte)
// Requires: AVX, AVX2
TEXT ·avx2XorSlice_64(SB), $0-48
	MOVQ in_len+8(FP), DX    // DX = len(in)
	MOVQ out_base+24(FP), CX // CX = destination cursor
	MOVQ in_base+0(FP), AX   // AX = source cursor
	SHRQ $6, DX              // DX = number of 64-byte blocks; ZF set when zero
	JZ   avx2_xor64_done

avx2_xor64_block:
	// All four loads are issued before any store.
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y1
	VMOVDQU (AX), Y2
	VMOVDQU 32(AX), Y3
	VPXOR   Y2, Y0, Y0
	VPXOR   Y3, Y1, Y1
	VMOVDQU Y0, (CX)
	VMOVDQU Y1, 32(CX)
	ADDQ    $64, CX
	ADDQ    $64, AX
	DECQ    DX
	JNZ     avx2_xor64_block

avx2_xor64_done:
	VZEROUPPER
	RET
|
|
|
|
// mulGFNI_1x1_64 applies 1 GF(2^8) affine bit-matrix to 1 input and overwrites
// 1 output, 64 bytes per iteration (ZMM registers).
//
//   matrix[0]  8x8 bit matrix (one uint64), broadcast to every 64-bit lane
//   in[0]      source; its data pointer is read from the first slice header
//   out[0]     destination; its data pointer is read from the first slice header
//   start      byte offset added to the source and destination pointers
//   n          byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x1_64(SB), $0-88
	// AX = blocks remaining, CX = input cursor, DX = output cursor,
	// Z0 = matrix.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x1_64_end
	VBROADCASTF32X2 (CX), Z0
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulGFNI_1x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (CX), Z1
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z1, Z1

	// Store 1 outputs
	VMOVDQU64 Z1, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x1_64_loop
	VZEROUPPER

mulGFNI_1x1_64_end:
	RET
|
|
|
|
// mulAvxGFNI_1x1 applies 1 GF(2^8) affine bit-matrix to 1 input and overwrites
// 1 output, 32 bytes per iteration (YMM registers).
//
//   matrix[0]  8x8 bit matrix (one uint64), broadcast to every 64-bit lane
//   in[0]      source; its data pointer is read from the first slice header
//   out[0]     destination; its data pointer is read from the first slice header
//   start      byte offset added to the source and destination pointers
//   n          byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x1(SB), $0-88
	// AX = blocks remaining, CX = input cursor, DX = output cursor,
	// Y0 = matrix.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x1_end
	VBROADCASTSD (CX), Y0
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulAvxGFNI_1x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (CX), Y1
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y1, Y1

	// Store 1 outputs
	VMOVDQU Y1, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x1_loop
	VZEROUPPER

mulAvxGFNI_1x1_end:
	RET
|
|
|
|
// mulGFNI_1x1_64Xor applies 1 GF(2^8) affine bit-matrix to 1 input and XORs
// the result into the existing contents of 1 output, 64 bytes per iteration
// (ZMM registers).
//
//   matrix[0]  8x8 bit matrix (one uint64), broadcast to every 64-bit lane
//   in[0]      source; its data pointer is read from the first slice header
//   out[0]     destination, read and rewritten in place
//   start      byte offset added to the source and destination pointers
//   n          byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x1_64Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, DX = output cursor,
	// Z0 = matrix.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulGFNI_1x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (DX), Z1

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (CX), Z2
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z2, Z2
	VXORPD Z1, Z2, Z1

	// Store 1 outputs
	VMOVDQU64 Z1, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x1_64Xor_loop
	VZEROUPPER

mulGFNI_1x1_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_1x1Xor applies 1 GF(2^8) affine bit-matrix to 1 input and XORs
// the result into the existing contents of 1 output, 32 bytes per iteration
// (YMM registers).
//
//   matrix[0]  8x8 bit matrix (one uint64), broadcast to every 64-bit lane
//   in[0]      source; its data pointer is read from the first slice header
//   out[0]     destination, read and rewritten in place
//   start      byte offset added to the source and destination pointers
//   n          byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, DX = output cursor,
	// Y0 = matrix.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x1Xor_end
	VBROADCASTSD (CX), Y0
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulAvxGFNI_1x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (DX), Y1

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (CX), Y2
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y2, Y2
	VXORPD Y1, Y2, Y1

	// Store 1 outputs
	VMOVDQU Y1, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x1Xor_loop
	VZEROUPPER

mulAvxGFNI_1x1Xor_end:
	RET
|
|
|
|
// mulGFNI_1x2_64 applies 2 GF(2^8) affine bit-matrices to 1 input and
// overwrites 2 outputs, 64 bytes per iteration (ZMM registers).
//
//   matrix[0..1]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..1]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x2_64(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/DX = output cursors,
	// Z0-Z1 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulGFNI_1x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (CX), Z3
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
	VGF2P8AFFINEQB $0x00, Z1, Z3, Z3

	// Store 2 outputs
	VMOVDQU64 Z2, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z3, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x2_64_loop
	VZEROUPPER

mulGFNI_1x2_64_end:
	RET
|
|
|
|
// mulAvxGFNI_1x2 applies 2 GF(2^8) affine bit-matrices to 1 input and
// overwrites 2 outputs, 32 bytes per iteration (YMM registers).
//
//   matrix[0..1]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..1]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x2(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/DX = output cursors,
	// Y0-Y1 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulAvxGFNI_1x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (CX), Y3
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
	VGF2P8AFFINEQB $0x00, Y1, Y3, Y3

	// Store 2 outputs
	VMOVDQU Y2, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y3, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x2_loop
	VZEROUPPER

mulAvxGFNI_1x2_end:
	RET
|
|
|
|
// mulGFNI_1x2_64Xor applies 2 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 64 bytes per
// iteration (ZMM registers).
//
//   matrix[0..1]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..1]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x2_64Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/DX = output cursors,
	// Z0-Z1 = matrices, Z5 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulGFNI_1x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (BX), Z2
	VMOVDQU64 (DX), Z3

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (CX), Z4
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z2, Z5, Z2
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z5
	VXORPD Z3, Z5, Z3

	// Store 2 outputs
	VMOVDQU64 Z2, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z3, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x2_64Xor_loop
	VZEROUPPER

mulGFNI_1x2_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_1x2Xor applies 2 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 32 bytes per
// iteration (YMM registers).
//
//   matrix[0..1]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..1]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/DX = output cursors,
	// Y0-Y1 = matrices, Y5 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulAvxGFNI_1x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (BX), Y2
	VMOVDQU (DX), Y3

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (CX), Y4
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y4, Y5
	VXORPD Y2, Y5, Y2
	VGF2P8AFFINEQB $0x00, Y1, Y4, Y5
	VXORPD Y3, Y5, Y3

	// Store 2 outputs
	VMOVDQU Y2, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y3, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x2Xor_loop
	VZEROUPPER

mulAvxGFNI_1x2Xor_end:
	RET
|
|
|
|
// mulGFNI_1x3_64 applies 3 GF(2^8) affine bit-matrices to 1 input and
// overwrites 3 outputs, 64 bytes per iteration (ZMM registers).
//
//   matrix[0..2]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..2]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x3_64(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DX = output cursors,
	// Z0-Z2 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulGFNI_1x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (CX), Z5
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z3
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z4
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z5

	// Store 3 outputs
	VMOVDQU64 Z3, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z4, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z5, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x3_64_loop
	VZEROUPPER

mulGFNI_1x3_64_end:
	RET
|
|
|
|
// mulAvxGFNI_1x3 applies 3 GF(2^8) affine bit-matrices to 1 input and
// overwrites 3 outputs, 32 bytes per iteration (YMM registers).
//
//   matrix[0..2]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..2]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x3(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DX = output cursors,
	// Y0-Y2 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulAvxGFNI_1x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (CX), Y5
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y5, Y3
	VGF2P8AFFINEQB $0x00, Y1, Y5, Y4
	VGF2P8AFFINEQB $0x00, Y2, Y5, Y5

	// Store 3 outputs
	VMOVDQU Y3, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y4, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y5, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x3_loop
	VZEROUPPER

mulAvxGFNI_1x3_end:
	RET
|
|
|
|
// mulGFNI_1x3_64Xor applies 3 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 64 bytes per
// iteration (ZMM registers).
//
//   matrix[0..2]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..2]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x3_64Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DX = output cursors,
	// Z0-Z2 = matrices, Z7 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulGFNI_1x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (BX), Z3
	VMOVDQU64 (SI), Z4
	VMOVDQU64 (DX), Z5

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (CX), Z6
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
	VXORPD Z3, Z7, Z3
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VXORPD Z4, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD Z5, Z7, Z5

	// Store 3 outputs
	VMOVDQU64 Z3, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z4, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z5, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x3_64Xor_loop
	VZEROUPPER

mulGFNI_1x3_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_1x3Xor applies 3 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 32 bytes per
// iteration (YMM registers).
//
//   matrix[0..2]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..2]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DX = output cursors,
	// Y0-Y2 = matrices, Y7 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulAvxGFNI_1x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (BX), Y3
	VMOVDQU (SI), Y4
	VMOVDQU (DX), Y5

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (CX), Y6
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
	VXORPD Y3, Y7, Y3
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
	VXORPD Y4, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
	VXORPD Y5, Y7, Y5

	// Store 3 outputs
	VMOVDQU Y3, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y4, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y5, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x3Xor_loop
	VZEROUPPER

mulAvxGFNI_1x3Xor_end:
	RET
|
|
|
|
// mulGFNI_1x4_64 applies 4 GF(2^8) affine bit-matrices to 1 input and
// overwrites 4 outputs, 64 bytes per iteration (ZMM registers).
//
//   matrix[0..3]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..3]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x4_64(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DI/DX = output cursors,
	// Z0-Z3 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulGFNI_1x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (CX), Z7
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z1, Z7, Z5
	VGF2P8AFFINEQB $0x00, Z2, Z7, Z6
	VGF2P8AFFINEQB $0x00, Z3, Z7, Z7

	// Store 4 outputs
	VMOVDQU64 Z4, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z5, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z6, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z7, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x4_64_loop
	VZEROUPPER

mulGFNI_1x4_64_end:
	RET
|
|
|
|
// mulAvxGFNI_1x4 applies 4 GF(2^8) affine bit-matrices to 1 input and
// overwrites 4 outputs, 32 bytes per iteration (YMM registers).
//
//   matrix[0..3]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..3]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x4(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DI/DX = output cursors,
	// Y0-Y3 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulAvxGFNI_1x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (CX), Y7
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y7, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y7, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y7, Y7

	// Store 4 outputs
	VMOVDQU Y4, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y5, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x4_loop
	VZEROUPPER

mulAvxGFNI_1x4_end:
	RET
|
|
|
|
// mulGFNI_1x4_64Xor applies 4 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 64 bytes per
// iteration (ZMM registers).
//
//   matrix[0..3]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..3]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x4_64Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DI/DX = output cursors,
	// Z0-Z3 = matrices, Z9 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulGFNI_1x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (BX), Z4
	VMOVDQU64 (SI), Z5
	VMOVDQU64 (DI), Z6
	VMOVDQU64 (DX), Z7

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (CX), Z8
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
	VXORPD Z4, Z9, Z4
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
	VXORPD Z5, Z9, Z5
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Store 4 outputs
	VMOVDQU64 Z4, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z5, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z6, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z7, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x4_64Xor_loop
	VZEROUPPER

mulGFNI_1x4_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_1x4Xor applies 4 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 32 bytes per
// iteration (YMM registers).
//
//   matrix[0..3]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..3]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor, BX/SI/DI/DX = output cursors,
	// Y0-Y3 = matrices, Y9 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulAvxGFNI_1x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (BX), Y4
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU (DX), Y7

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (CX), Y8
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
	VXORPD Y4, Y9, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
	VXORPD Y5, Y9, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Store 4 outputs
	VMOVDQU Y4, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y5, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x4Xor_loop
	VZEROUPPER

mulAvxGFNI_1x4Xor_end:
	RET
|
|
|
|
// mulGFNI_1x5_64 applies 5 GF(2^8) affine bit-matrices to 1 input and
// overwrites 5 outputs, 64 bytes per iteration (ZMM registers).
//
//   matrix[0..4]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..4]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x5_64(SB), $0-88
	// AX = blocks remaining, CX = input cursor,
	// BX/SI/DI/R8/DX = output cursors, Z0-Z4 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x5_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulGFNI_1x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (CX), Z9
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z9, Z5
	VGF2P8AFFINEQB $0x00, Z1, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z2, Z9, Z7
	VGF2P8AFFINEQB $0x00, Z3, Z9, Z8
	VGF2P8AFFINEQB $0x00, Z4, Z9, Z9

	// Store 5 outputs
	VMOVDQU64 Z5, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z6, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z7, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z8, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z9, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x5_64_loop
	VZEROUPPER

mulGFNI_1x5_64_end:
	RET
|
|
|
|
// mulAvxGFNI_1x5 applies 5 GF(2^8) affine bit-matrices to 1 input and
// overwrites 5 outputs, 32 bytes per iteration (YMM registers).
//
//   matrix[0..4]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..4]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x5(SB), $0-88
	// AX = blocks remaining, CX = input cursor,
	// BX/SI/DI/R8/DX = output cursors, Y0-Y4 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x5_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulAvxGFNI_1x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (CX), Y9
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y9, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y9, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y9, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y9, Y9

	// Store 5 outputs
	VMOVDQU Y5, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y6, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x5_loop
	VZEROUPPER

mulAvxGFNI_1x5_end:
	RET
|
|
|
|
// mulGFNI_1x5_64Xor applies 5 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 64 bytes per
// iteration (ZMM registers).
//
//   matrix[0..4]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..4]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x5_64Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor,
	// BX/SI/DI/R8/DX = output cursors, Z0-Z4 = matrices,
	// Z11 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x5_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulGFNI_1x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (BX), Z5
	VMOVDQU64 (SI), Z6
	VMOVDQU64 (DI), Z7
	VMOVDQU64 (R8), Z8
	VMOVDQU64 (DX), Z9

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (CX), Z10
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
	VXORPD Z5, Z11, Z5
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
	VXORPD Z6, Z11, Z6
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
	VXORPD Z7, Z11, Z7
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
	VXORPD Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
	VXORPD Z9, Z11, Z9

	// Store 5 outputs
	VMOVDQU64 Z5, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z6, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z7, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z8, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z9, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x5_64Xor_loop
	VZEROUPPER

mulGFNI_1x5_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_1x5Xor applies 5 GF(2^8) affine bit-matrices to 1 input and XORs
// each result into the existing contents of the matching output, 32 bytes per
// iteration (YMM registers).
//
//   matrix[0..4]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..4]     destinations, read and rewritten in place
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>5 whole 32-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>5 is zero.
//
// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88
	// AX = blocks remaining, CX = input cursor,
	// BX/SI/DI/R8/DX = output cursors, Y0-Y4 = matrices,
	// Y11 = per-output product scratch.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulAvxGFNI_1x5Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulAvxGFNI_1x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (BX), Y5
	VMOVDQU (SI), Y6
	VMOVDQU (DI), Y7
	VMOVDQU (R8), Y8
	VMOVDQU (DX), Y9

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (CX), Y10
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
	VXORPD Y5, Y11, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
	VXORPD Y6, Y11, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
	VXORPD Y7, Y11, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
	VXORPD Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
	VXORPD Y9, Y11, Y9

	// Store 5 outputs
	VMOVDQU Y5, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y6, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x5Xor_loop
	VZEROUPPER

mulAvxGFNI_1x5Xor_end:
	RET
|
|
|
|
// mulGFNI_1x6_64 applies 6 GF(2^8) affine bit-matrices to 1 input and
// overwrites 6 outputs, 64 bytes per iteration (ZMM registers).
//
//   matrix[0..5]  one 8x8 bit matrix (uint64) per output, broadcast to every lane
//   in[0]         source; its data pointer is read from the first slice header
//   out[0..5]     destinations; data pointers read from slice headers 24 bytes apart
//   start         byte offset added to the source and every destination pointer
//   n             byte count; only n>>6 whole 64-byte blocks are processed
//
// Returns immediately, touching no memory, when n>>6 is zero.
//
// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x6_64(SB), $0-88
	// AX = blocks remaining, CX = input cursor,
	// BX/SI/DI/R8/R9/DX = output cursors, Z0-Z5 = matrices.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // sets ZF on a zero result, so no separate TESTQ is needed
	JZ   mulGFNI_1x6_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulGFNI_1x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (CX), Z11
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z11, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z11, Z7
	VGF2P8AFFINEQB $0x00, Z2, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z3, Z11, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z11, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z11, Z11

	// Store 6 outputs
	VMOVDQU64 Z6, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z7, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z8, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z9, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z10, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z11, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x6_64_loop
	VZEROUPPER

mulGFNI_1x6_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Transforms in[0] into 6 outputs, 32 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..5,
// overwriting the previous contents of out[k]. Only n/32 whole blocks are
// processed; the remaining n%32 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/DX = output pointers 0..5, Y0-Y5 = tables, Y6-Y11 = results.
TEXT ·mulAvxGFNI_1x6(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x6_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulAvxGFNI_1x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// The last transform overwrites the input register Y11.
	VMOVDQU        (CX), Y11
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y11, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y11, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y11, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y11, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y11, Y11

	// Store 6 outputs
	VMOVDQU Y6, (BX)
	ADDQ    $0x20, BX
	VMOVDQU Y7, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y8, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y9, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (DX)
	ADDQ    $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x6_loop
	VZEROUPPER

mulAvxGFNI_1x6_end:
	RET
|
|
|
|
// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant, 64 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..5.
// Only n/64 whole blocks are processed; the remaining n%64 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/DX = output pointers 0..5, Z0-Z5 = tables,
// Z6-Z11 = output accumulators, Z12 = input, Z13 = scratch product.
TEXT ·mulGFNI_1x6_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x6_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulGFNI_1x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (BX), Z6
	VMOVDQU64 (SI), Z7
	VMOVDQU64 (DI), Z8
	VMOVDQU64 (R8), Z9
	VMOVDQU64 (R9), Z10
	VMOVDQU64 (DX), Z11

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64      (CX), Z12
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
	VXORPD         Z6, Z13, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
	VXORPD         Z7, Z13, Z7
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD         Z8, Z13, Z8
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD         Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Store 6 outputs
	VMOVDQU64 Z6, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z7, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z8, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z9, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z10, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z11, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x6_64Xor_loop
	VZEROUPPER

mulGFNI_1x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant, 32 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..5.
// Only n/32 whole blocks are processed; the remaining n%32 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/DX = output pointers 0..5, Y0-Y5 = tables,
// Y6-Y11 = output accumulators, Y12 = input, Y13 = scratch product.
TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x6Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulAvxGFNI_1x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (BX), Y6
	VMOVDQU (SI), Y7
	VMOVDQU (DI), Y8
	VMOVDQU (R8), Y9
	VMOVDQU (R9), Y10
	VMOVDQU (DX), Y11

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU        (CX), Y12
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
	VXORPD         Y6, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
	VXORPD         Y7, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD         Y8, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD         Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Store 6 outputs
	VMOVDQU Y6, (BX)
	ADDQ    $0x20, BX
	VMOVDQU Y7, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y8, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y9, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (DX)
	ADDQ    $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x6Xor_loop
	VZEROUPPER

mulAvxGFNI_1x6Xor_end:
	RET
|
|
|
|
// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Transforms in[0] into 7 outputs, 64 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..6,
// overwriting the previous contents of out[k]. Only n/64 whole blocks are
// processed; the remaining n%64 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/DX = output pointers 0..6, Z0-Z6 = tables,
// Z7-Z13 = results.
TEXT ·mulGFNI_1x7_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x7_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulGFNI_1x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// The last transform overwrites the input register Z13.
	VMOVDQU64      (CX), Z13
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z13, Z7
	VGF2P8AFFINEQB $0x00, Z1, Z13, Z8
	VGF2P8AFFINEQB $0x00, Z2, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z3, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z4, Z13, Z11
	VGF2P8AFFINEQB $0x00, Z5, Z13, Z12
	VGF2P8AFFINEQB $0x00, Z6, Z13, Z13

	// Store 7 outputs
	VMOVDQU64 Z7, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z8, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z9, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z11, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z12, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z13, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x7_64_loop
	VZEROUPPER

mulGFNI_1x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Transforms in[0] into 7 outputs, 32 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..6,
// overwriting the previous contents of out[k]. Only n/32 whole blocks are
// processed; the remaining n%32 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/DX = output pointers 0..6, Y0-Y6 = tables,
// Y7-Y13 = results.
TEXT ·mulAvxGFNI_1x7(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulAvxGFNI_1x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// The last transform overwrites the input register Y13.
	VMOVDQU        (CX), Y13
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y13, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y13, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y13, Y13

	// Store 7 outputs
	VMOVDQU Y7, (BX)
	ADDQ    $0x20, BX
	VMOVDQU Y8, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y9, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (DX)
	ADDQ    $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x7_loop
	VZEROUPPER

mulAvxGFNI_1x7_end:
	RET
|
|
|
|
// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant, 64 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..6.
// Only n/64 whole blocks are processed; the remaining n%64 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/DX = output pointers 0..6, Z0-Z6 = tables,
// Z7-Z13 = output accumulators, Z14 = input, Z15 = scratch product.
TEXT ·mulGFNI_1x7_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulGFNI_1x7_64Xor_loop:
	// Load 7 outputs
	VMOVDQU64 (BX), Z7
	VMOVDQU64 (SI), Z8
	VMOVDQU64 (DI), Z9
	VMOVDQU64 (R8), Z10
	VMOVDQU64 (R9), Z11
	VMOVDQU64 (R10), Z12
	VMOVDQU64 (DX), Z13

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64      (CX), Z14
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
	VXORPD         Z7, Z15, Z7
	VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
	VXORPD         Z8, Z15, Z8
	VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
	VXORPD         Z9, Z15, Z9
	VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
	VXORPD         Z10, Z15, Z10
	VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
	VXORPD         Z11, Z15, Z11
	VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Store 7 outputs
	VMOVDQU64 Z7, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z8, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z9, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z11, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z12, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z13, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x7_64Xor_loop
	VZEROUPPER

mulGFNI_1x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant, 32 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..6.
// Only n/32 whole blocks are processed; the remaining n%32 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/DX = output pointers 0..6, Y0-Y6 = tables,
// Y7-Y13 = output accumulators, Y14 = input, Y15 = scratch product.
TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulAvxGFNI_1x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (BX), Y7
	VMOVDQU (SI), Y8
	VMOVDQU (DI), Y9
	VMOVDQU (R8), Y10
	VMOVDQU (R9), Y11
	VMOVDQU (R10), Y12
	VMOVDQU (DX), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU        (CX), Y14
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (BX)
	ADDQ    $0x20, BX
	VMOVDQU Y8, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y9, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (DX)
	ADDQ    $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x7Xor_loop
	VZEROUPPER

mulAvxGFNI_1x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Transforms in[0] into 8 outputs, 64 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..7,
// overwriting the previous contents of out[k]. Only n/64 whole blocks are
// processed; the remaining n%64 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/R11/DX = output pointers 0..7, Z0-Z7 = tables,
// Z8-Z15 = results.
TEXT ·mulGFNI_1x8_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DX

	// Add start offset to input
	ADDQ R12, CX

mulGFNI_1x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// The last transform overwrites the input register Z15.
	VMOVDQU64      (CX), Z15
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z15, Z8
	VGF2P8AFFINEQB $0x00, Z1, Z15, Z9
	VGF2P8AFFINEQB $0x00, Z2, Z15, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z15, Z11
	VGF2P8AFFINEQB $0x00, Z4, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z15, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z15, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z15, Z15

	// Store 8 outputs
	VMOVDQU64 Z8, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z9, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z10, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z11, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z12, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z13, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z14, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z15, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x8_64_loop
	VZEROUPPER

mulGFNI_1x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Transforms in[0] into 8 outputs, 32 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..7,
// overwriting the previous contents of out[k]. Only n/32 whole blocks are
// processed; the remaining n%32 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = matrix pointer (setup only),
// DX = input pointer, SI/DI/R8/R9/R10/R11/R12/BX = output pointers 0..7,
// Y0-Y5, Y14, Y15 = tables 0..7, Y6-Y13 = results (Y13 also holds the input).
TEXT ·mulAvxGFNI_1x8(SB), $0-88
	// Loading all 8 tables to registers
	// Destination kept in GP registers
	// All 16 YMM registers used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Tables 6 and 7 are loop-invariant: keep them in the otherwise unused
	// Y14/Y15 instead of re-broadcasting from memory on every iteration.
	VBROADCASTSD 48(CX), Y14
	VBROADCASTSD 56(CX), Y15

	// DX = &in[0][0]
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX

	// Slice headers are 24 bytes apart; BX is consumed last.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), BX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, BX

	// Add start offset to input
	ADDQ R13, DX

mulAvxGFNI_1x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// The last transform overwrites the input register Y13, so it must stay last.
	VMOVDQU        (DX), Y13
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y13, Y11
	VGF2P8AFFINEQB $0x00, Y14, Y13, Y12
	VGF2P8AFFINEQB $0x00, Y15, Y13, Y13

	// Store 8 outputs
	VMOVDQU Y6, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x8_loop
	VZEROUPPER

mulAvxGFNI_1x8_end:
	RET
|
|
|
|
// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant, 64 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..7.
// Only n/64 whole blocks are processed; the remaining n%64 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/R11/DX = output pointers 0..7, Z0-Z7 = tables,
// Z8-Z15 = output accumulators, Z16 = input, Z17 = scratch product.
TEXT ·mulGFNI_1x8_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DX

	// Add start offset to input
	ADDQ R12, CX

mulGFNI_1x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (BX), Z8
	VMOVDQU64 (SI), Z9
	VMOVDQU64 (DI), Z10
	VMOVDQU64 (R8), Z11
	VMOVDQU64 (R9), Z12
	VMOVDQU64 (R10), Z13
	VMOVDQU64 (R11), Z14
	VMOVDQU64 (DX), Z15

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (CX), Z16
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
	VXORPD         Z8, Z17, Z8
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
	VXORPD         Z9, Z17, Z9
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD         Z10, Z17, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD         Z11, Z17, Z11
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD         Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD         Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Store 8 outputs
	VMOVDQU64 Z8, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z9, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z10, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z11, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z12, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z13, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z14, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z15, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x8_64Xor_loop
	VZEROUPPER

mulGFNI_1x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant, 32 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..7.
// Only n/32 whole blocks are processed; the remaining n%32 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = matrix pointer (still read inside the loop),
// DX = input pointer, SI/DI/R8/R9/R10/R11/R12/BX = output pointers 0..7,
// Y0-Y5 = tables 0..5, Y6-Y13 = output accumulators, Y14 = input,
// Y15 = scratch (table 6/7 reload, then product).
TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88
	// Loading 6 of 8 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// DX = &in[0][0]
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX

	// Slice headers are 24 bytes apart; BX is consumed last.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), BX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, BX

	// Add start offset to input
	ADDQ R13, DX

mulAvxGFNI_1x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (SI), Y6
	VMOVDQU (DI), Y7
	VMOVDQU (R8), Y8
	VMOVDQU (R9), Y9
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (BX), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11

	// All 16 YMM registers are live, so tables 6 and 7 are re-broadcast
	// from memory into the scratch register on every iteration.
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x8Xor_loop
	VZEROUPPER

mulAvxGFNI_1x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Transforms in[0] into 9 outputs, 64 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..8,
// overwriting the previous contents of out[k]. Only n/64 whole blocks are
// processed; the remaining n%64 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/R11/R12/DX = output pointers 0..8, Z0-Z8 = tables,
// Z9-Z17 = results.
TEXT ·mulGFNI_1x9_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DX

	// Add start offset to input
	ADDQ R13, CX

mulGFNI_1x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// The last transform overwrites the input register Z17.
	VMOVDQU64      (CX), Z17
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z17, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z17, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z17, Z11
	VGF2P8AFFINEQB $0x00, Z3, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z4, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z5, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z6, Z17, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z17, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z17, Z17

	// Store 9 outputs
	VMOVDQU64 Z9, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z10, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z11, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z12, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z13, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z14, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z15, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z16, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z17, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x9_64_loop
	VZEROUPPER

mulGFNI_1x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Transforms in[0] into 9 outputs, 32 bytes per iteration:
// out[k][start+i] = affine(matrix[k], in[0][start+i]) for k = 0..8,
// overwriting the previous contents of out[k]. Only n/32 whole blocks are
// processed; the remaining n%32 bytes are left untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = matrix pointer (still read inside the loop),
// DX = input pointer, SI/DI/R8/R9/R10/R11/R12/R13/BX = output pointers 0..8,
// Y0-Y4 = tables 0..4, Y14/Y15 = tables 7/8, Y5-Y13 = results
// (Y10/Y11 briefly hold tables 5/6, Y13 also holds the input).
TEXT ·mulAvxGFNI_1x9(SB), $0-88
	// Loading 7 of 9 tables to registers
	// Destination kept in GP registers
	// All 16 YMM registers used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ           mulAvxGFNI_1x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Tables 7 and 8 are loop-invariant: keep them in the otherwise unused
	// Y14/Y15 instead of re-broadcasting from memory on every iteration.
	VBROADCASTSD 56(CX), Y14
	VBROADCASTSD 64(CX), Y15

	// DX = &in[0][0]
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX

	// Slice headers are 24 bytes apart; BX is consumed last.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), BX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, BX

	// Add start offset to input
	ADDQ R14, DX

mulAvxGFNI_1x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU        (DX), Y13
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y13, Y9

	// No register left for tables 5 and 6: broadcast each into its own
	// destination register, then transform in place.
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
	VGF2P8AFFINEQB $0x00, Y14, Y13, Y12

	// The last transform overwrites the input register Y13, so it must stay last.
	VGF2P8AFFINEQB $0x00, Y15, Y13, Y13

	// Store 9 outputs
	VMOVDQU Y5, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y7, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x9_loop
	VZEROUPPER

mulAvxGFNI_1x9_end:
	RET
|
|
|
|
// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant, 64 bytes per iteration:
// out[k][start+i] ^= affine(matrix[k], in[0][start+i]) for k = 0..8.
// Only n/64 whole blocks are processed; the remaining n%64 bytes are left
// untouched.
// NOTE(review): each matrix[k] is presumably the 8x8 bit matrix of a GF(2^8)
// coefficient multiply - confirm against the Go caller.
//
// Registers: AX = blocks left, CX = input pointer,
// BX/SI/DI/R8/R9/R10/R11/R12/DX = output pointers 0..8, Z0-Z8 = tables,
// Z9-Z17 = output accumulators, Z18 = input, Z19 = scratch product.
TEXT ·mulGFNI_1x9_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	JZ              mulGFNI_1x9_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// CX = &in[0][0]
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// Slice headers are 24 bytes apart; DX is consumed last.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DX

	// Add start offset to input
	ADDQ R13, CX

mulGFNI_1x9_64Xor_loop:
	// Load 9 outputs
	VMOVDQU64 (BX), Z9
	VMOVDQU64 (SI), Z10
	VMOVDQU64 (DI), Z11
	VMOVDQU64 (R8), Z12
	VMOVDQU64 (R9), Z13
	VMOVDQU64 (R10), Z14
	VMOVDQU64 (R11), Z15
	VMOVDQU64 (R12), Z16
	VMOVDQU64 (DX), Z17

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64      (CX), Z18
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
	VXORPD         Z9, Z19, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
	VXORPD         Z10, Z19, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
	VXORPD         Z11, Z19, Z11
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD         Z12, Z19, Z12
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD         Z13, Z19, Z13
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD         Z14, Z19, Z14
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD         Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD         Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD         Z17, Z19, Z17

	// Store 9 outputs
	VMOVDQU64 Z9, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z10, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z11, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z12, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z13, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z14, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z15, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z16, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z17, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x9_64Xor_loop
	VZEROUPPER

mulGFNI_1x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the nine outputs j, XORs into out[j] the GF2P8AFFINEQB transform
// of in[0] by the bit matrix matrix[j], 32 bytes per iteration, beginning at
// byte offset start. Only n/32 whole blocks are handled; when n < 32 no slice
// data is touched.
//
// Register use:
//   AX               blocks remaining
//   CX               matrix base (matrix[5..8] are re-broadcast on every pass;
//                    only matrix[0..4] stay resident)
//   DX               in[0] cursor
//   SI DI R8-R13 BX  out[0..8] cursors
//   Y0-Y4            matrix[0..4]
//   Y5-Y13           output accumulators
//   Y14 / Y15        input block / scratch
TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_1x9Xor_end

	// Resident tables.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// in[0] data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX

	// out[j] data pointers; slice headers are 24 bytes apart.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), BX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, BX

	// Add start offset to input
	ADDQ R14, DX

mulAvxGFNI_1x9Xor_loop:
	// Load 9 outputs
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU (R8), Y7
	VMOVDQU (R9), Y8
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (BX), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y9, Y15, Y9

	// Non-resident tables: broadcast into the scratch register, then transform in place.
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y7, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x9Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_1x9Xor_end:
	RET
|
|
|
|
// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the ten outputs j, overwrites out[j] with the GF2P8AFFINEQB
// transform of in[0] by the bit matrix matrix[j], 64 bytes per iteration,
// beginning at byte offset start. Only n/64 whole blocks are handled; when
// n < 64 no slice data is touched.
//
// Register use:
//   AX                  blocks remaining
//   CX                  matrix base during setup, then in[0] cursor
//   BX SI DI R8-R13 DX  out[0..9] cursors
//   Z0-Z9               matrix[0..9]
//   Z10-Z19             results (Z19 also carries the input block until the
//                       last transform overwrites it)
TEXT ·mulGFNI_1x10_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_1x10_64_end

	// All ten tables stay resident.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// in[0] data pointer (CX is free now that the tables are loaded).
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// out[j] data pointers; slice headers are 24 bytes apart.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, DX

	// Add start offset to input
	ADDQ R14, CX

mulGFNI_1x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64      (CX), Z19
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z19, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z19, Z11
	VGF2P8AFFINEQB $0x00, Z2, Z19, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z19, Z13
	VGF2P8AFFINEQB $0x00, Z4, Z19, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z19, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z19, Z18

	// Last use of the input: the result replaces it.
	VGF2P8AFFINEQB $0x00, Z9, Z19, Z19

	// Store 10 outputs
	VMOVDQU64 Z10, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z11, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z12, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z15, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z16, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z17, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z18, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z19, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x10_64_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_1x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the ten outputs j, overwrites out[j] with the GF2P8AFFINEQB
// transform of in[0] by the bit matrix matrix[j], 32 bytes per iteration,
// beginning at byte offset start. Only n/32 whole blocks are handled; when
// n < 32 no slice data is touched.
//
// Register use:
//   AX                   blocks remaining
//   CX                   matrix base (matrix[4..9] are re-broadcast on every
//                        pass; only matrix[0..3] stay resident)
//   DX                   in[0] cursor
//   SI DI R8-R14 BX      out[0..9] cursors
//   Y0-Y3                matrix[0..3]
//   Y4-Y12               results for out[0..8]
//   Y13                  input block, then result for out[9]
//   Y14                  matrix[9], kept separate so Y13 can be overwritten
TEXT ·mulAvxGFNI_1x10(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_1x10_end

	// Resident tables.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// in[0] data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX

	// out[j] data pointers; slice headers are 24 bytes apart.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), R14
	MOVQ 216(BX), BX
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, BX

	// Add start offset to input
	ADDQ R15, DX

mulAvxGFNI_1x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU        (DX), Y13
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y7

	// Non-resident tables: broadcast into the destination, then transform in place.
	VBROADCASTSD   32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y13, Y8
	VBROADCASTSD   40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y13, Y9
	VBROADCASTSD   48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
	VBROADCASTSD   56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
	VBROADCASTSD   64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y13, Y12

	// Last use of the input: the result replaces it.
	VBROADCASTSD   72(CX), Y14
	VGF2P8AFFINEQB $0x00, Y14, Y13, Y13

	// Store 10 outputs
	VMOVDQU Y4, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y5, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x10_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_1x10_end:
	RET
|
|
|
|
// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the ten outputs j, XORs into out[j] the GF2P8AFFINEQB transform
// of in[0] by the bit matrix matrix[j], 64 bytes per iteration, beginning at
// byte offset start. Only n/64 whole blocks are handled; when n < 64 no slice
// data is touched.
//
// Register use:
//   AX                  blocks remaining
//   CX                  matrix base during setup, then in[0] cursor
//   BX SI DI R8-R13 DX  out[0..9] cursors
//   Z0-Z9               matrix[0..9]
//   Z10-Z19             output accumulators
//   Z20 / Z21           input block / scratch
TEXT ·mulGFNI_1x10_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_1x10_64Xor_end

	// All ten tables stay resident.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// in[0] data pointer (CX is free now that the tables are loaded).
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX

	// out[j] data pointers; slice headers are 24 bytes apart.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, DX

	// Add start offset to input
	ADDQ R14, CX

mulGFNI_1x10_64Xor_loop:
	// Load 10 outputs
	VMOVDQU64 (BX), Z10
	VMOVDQU64 (SI), Z11
	VMOVDQU64 (DI), Z12
	VMOVDQU64 (R8), Z13
	VMOVDQU64 (R9), Z14
	VMOVDQU64 (R10), Z15
	VMOVDQU64 (R11), Z16
	VMOVDQU64 (R12), Z17
	VMOVDQU64 (R13), Z18
	VMOVDQU64 (DX), Z19

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD         Z10, Z21, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD         Z11, Z21, Z11
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD         Z12, Z21, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD         Z13, Z21, Z13
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD         Z14, Z21, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 10 outputs
	VMOVDQU64 Z10, (BX)
	ADDQ      $0x40, BX
	VMOVDQU64 Z11, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z12, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z15, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z16, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z17, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z18, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z19, (DX)
	ADDQ      $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x10_64Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_1x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the ten outputs j, XORs into out[j] the GF2P8AFFINEQB transform
// of in[0] by the bit matrix matrix[j], 32 bytes per iteration, beginning at
// byte offset start. Only n/32 whole blocks are handled; when n < 32 no slice
// data is touched.
//
// Register use:
//   AX               blocks remaining
//   CX               matrix base (matrix[4..9] are re-broadcast on every pass;
//                    only matrix[0..3] stay resident)
//   DX               in[0] cursor
//   SI DI R8-R14 BX  out[0..9] cursors
//   Y0-Y3            matrix[0..3]
//   Y4-Y13           output accumulators
//   Y14 / Y15        input block / scratch
TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_1x10Xor_end

	// Resident tables.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// in[0] data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX

	// out[j] data pointers; slice headers are 24 bytes apart.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), R14
	MOVQ 216(BX), BX
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, BX

	// Add start offset to input
	ADDQ R15, DX

mulAvxGFNI_1x10Xor_loop:
	// Load 10 outputs
	VMOVDQU (SI), Y4
	VMOVDQU (DI), Y5
	VMOVDQU (R8), Y6
	VMOVDQU (R9), Y7
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (BX), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y7, Y15, Y7

	// Non-resident tables: broadcast into the scratch register, then transform in place.
	VBROADCASTSD   32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 10 outputs
	VMOVDQU Y4, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y5, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x10Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_1x10Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites out[0] with the XOR of the GF2P8AFFINEQB transforms of in[0] by
// matrix[0] and of in[1] by matrix[1], 64 bytes per iteration, beginning at
// byte offset start. Only n/64 whole blocks are handled; when n < 64 no slice
// data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   BX     out[0] cursor
//   Z0 Z1  matrix[0], matrix[1]
//   Z2     result
//   Z3     input block / scratch
TEXT ·mulGFNI_2x1_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x1_64_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), BX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX

	// Add start offset to input
	ADDQ SI, DX
	ADDQ SI, CX

mulGFNI_2x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64      (DX), Z3
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z2

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64      (CX), Z3
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
	VXORPD         Z2, Z3, Z2

	// Store 1 outputs
	VMOVDQU64 Z2, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x1_64_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_2x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites out[0] with the XOR of the GF2P8AFFINEQB transforms of in[0] by
// matrix[0] and of in[1] by matrix[1], 32 bytes per iteration, beginning at
// byte offset start. Only n/32 whole blocks are handled; when n < 32 no slice
// data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   BX     out[0] cursor
//   Y0 Y1  matrix[0], matrix[1]
//   Y2     result
//   Y3     input block / scratch
TEXT ·mulAvxGFNI_2x1(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x1_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), BX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX

	// Add start offset to input
	ADDQ SI, DX
	ADDQ SI, CX

mulAvxGFNI_2x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU        (DX), Y3
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y3, Y2

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (CX), Y3
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
	VXORPD         Y2, Y3, Y2

	// Store 1 outputs
	VMOVDQU Y2, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x1_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_2x1_end:
	RET
|
|
|
|
// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs into out[0] the GF2P8AFFINEQB transforms of in[0] by matrix[0] and of
// in[1] by matrix[1], 64 bytes per iteration, beginning at byte offset start.
// Only n/64 whole blocks are handled; when n < 64 no slice data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   BX     out[0] cursor
//   Z0 Z1  matrix[0], matrix[1]
//   Z2     output accumulator
//   Z3     input block / scratch
TEXT ·mulGFNI_2x1_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x1_64Xor_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), BX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX

	// Add start offset to input
	ADDQ SI, DX
	ADDQ SI, CX

mulGFNI_2x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (BX), Z2

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64      (DX), Z3
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z3
	VXORPD         Z2, Z3, Z2

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64      (CX), Z3
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
	VXORPD         Z2, Z3, Z2

	// Store 1 outputs
	VMOVDQU64 Z2, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x1_64Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_2x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs into out[0] the GF2P8AFFINEQB transforms of in[0] by matrix[0] and of
// in[1] by matrix[1], 32 bytes per iteration, beginning at byte offset start.
// Only n/32 whole blocks are handled; when n < 32 no slice data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   BX     out[0] cursor
//   Y0 Y1  matrix[0], matrix[1]
//   Y2     output accumulator
//   Y3     input block / scratch
TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x1Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), BX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX

	// Add start offset to input
	ADDQ SI, DX
	ADDQ SI, CX

mulAvxGFNI_2x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (BX), Y2

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU        (DX), Y3
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y3, Y3
	VXORPD         Y2, Y3, Y2

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (CX), Y3
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
	VXORPD         Y2, Y3, Y2

	// Store 1 outputs
	VMOVDQU Y2, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x1Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_2x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites out[0] and out[1], 64 bytes per iteration, beginning at byte
// offset start. out[j] receives the XOR of the GF2P8AFFINEQB transforms of
// in[0] by matrix[j] and of in[1] by matrix[2+j]. Only n/64 whole blocks are
// handled; when n < 64 no slice data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   SI BX  out[0], out[1] cursors
//   Z0-Z3  matrix[0..3]
//   Z4 Z5  results
//   Z6 Z7  input block / scratch
TEXT ·mulGFNI_2x2_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x2_64_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] / out[1] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), BX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI
	ADDQ DI, BX

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, CX

mulGFNI_2x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64      (DX), Z6
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z4
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z5

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (CX), Z6
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD         Z4, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
	VXORPD         Z5, Z7, Z5

	// Store 2 outputs
	VMOVDQU64 Z4, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z5, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x2_64_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_2x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites out[0] and out[1], 32 bytes per iteration, beginning at byte
// offset start. out[j] receives the XOR of the GF2P8AFFINEQB transforms of
// in[0] by matrix[j] and of in[1] by matrix[2+j]. Only n/32 whole blocks are
// handled; when n < 32 no slice data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   SI BX  out[0], out[1] cursors
//   Y0-Y3  matrix[0..3]
//   Y4 Y5  results
//   Y6 Y7  input block / scratch
TEXT ·mulAvxGFNI_2x2(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x2_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] / out[1] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), BX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI
	ADDQ DI, BX

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, CX

mulAvxGFNI_2x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (DX), Y6
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y5

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (CX), Y6
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
	VXORPD         Y4, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
	VXORPD         Y5, Y7, Y5

	// Store 2 outputs
	VMOVDQU Y4, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y5, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x2_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_2x2_end:
	RET
|
|
|
|
// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Updates out[0] and out[1] in place, 64 bytes per iteration, beginning at
// byte offset start. Into out[j] it XORs the GF2P8AFFINEQB transforms of in[0]
// by matrix[j] and of in[1] by matrix[2+j]. Only n/64 whole blocks are
// handled; when n < 64 no slice data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   SI BX  out[0], out[1] cursors
//   Z0-Z3  matrix[0..3]
//   Z4 Z5  output accumulators
//   Z6 Z7  input block / scratch
TEXT ·mulGFNI_2x2_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x2_64Xor_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] / out[1] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), BX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI
	ADDQ DI, BX

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, CX

mulGFNI_2x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (SI), Z4
	VMOVDQU64 (BX), Z5

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64      (DX), Z6
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
	VXORPD         Z4, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VXORPD         Z5, Z7, Z5

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (CX), Z6
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD         Z4, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
	VXORPD         Z5, Z7, Z5

	// Store 2 outputs
	VMOVDQU64 Z4, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z5, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x2_64Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_2x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Updates out[0] and out[1] in place, 32 bytes per iteration, beginning at
// byte offset start. Into out[j] it XORs the GF2P8AFFINEQB transforms of in[0]
// by matrix[j] and of in[1] by matrix[2+j]. Only n/32 whole blocks are
// handled; when n < 32 no slice data is touched.
//
// Register use:
//   AX     blocks remaining
//   DX     in[0] cursor
//   CX     matrix base during setup, then in[1] cursor
//   SI BX  out[0], out[1] cursors
//   Y0-Y3  matrix[0..3]
//   Y4 Y5  output accumulators
//   Y6 Y7  input block / scratch
TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x2Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0] / out[1] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), BX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI
	ADDQ DI, BX

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, CX

mulAvxGFNI_2x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (SI), Y4
	VMOVDQU (BX), Y5

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (DX), Y6
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
	VXORPD         Y4, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
	VXORPD         Y5, Y7, Y5

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (CX), Y6
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
	VXORPD         Y4, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
	VXORPD         Y5, Y7, Y5

	// Store 2 outputs
	VMOVDQU Y4, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y5, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x2Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_2x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites out[0..2], 64 bytes per iteration, beginning at byte offset
// start. out[j] receives the XOR of the GF2P8AFFINEQB transforms of in[0] by
// matrix[j] and of in[1] by matrix[3+j]. Only n/64 whole blocks are handled;
// when n < 64 no slice data is touched.
//
// Register use:
//   AX        blocks remaining
//   DX        in[0] cursor
//   CX        matrix base during setup, then in[1] cursor
//   SI DI BX  out[0..2] cursors
//   Z0-Z5     matrix[0..5]
//   Z6-Z8     results
//   Z9 Z10    input block / scratch
TEXT ·mulGFNI_2x3_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x3_64_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0..2] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), BX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, BX

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, CX

mulGFNI_2x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z9
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z9, Z7
	VGF2P8AFFINEQB $0x00, Z2, Z9, Z8

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (CX), Z9
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
	VXORPD         Z6, Z10, Z6
	VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
	VXORPD         Z7, Z10, Z7
	VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
	VXORPD         Z8, Z10, Z8

	// Store 3 outputs
	VMOVDQU64 Z6, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z7, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z8, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x3_64_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_2x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites out[0..2], 32 bytes per iteration, beginning at byte offset
// start. out[j] receives the XOR of the GF2P8AFFINEQB transforms of in[0] by
// matrix[j] and of in[1] by matrix[3+j]. Only n/32 whole blocks are handled;
// when n < 32 no slice data is touched.
//
// Register use:
//   AX        blocks remaining
//   DX        in[0] cursor
//   CX        matrix base during setup, then in[1] cursor
//   SI DI BX  out[0..2] cursors
//   Y0-Y5     matrix[0..5]
//   Y6-Y8     results
//   Y9 Y10    input block / scratch
TEXT ·mulAvxGFNI_2x3(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x3_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0..2] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), BX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, BX

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, CX

mulAvxGFNI_2x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU        (DX), Y9
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y9, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y9, Y8

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (CX), Y9
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
	VXORPD         Y6, Y10, Y6
	VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
	VXORPD         Y7, Y10, Y7
	VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
	VXORPD         Y8, Y10, Y8

	// Store 3 outputs
	VMOVDQU Y6, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y8, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x3_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_2x3_end:
	RET
|
|
|
|
// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Updates out[0..2] in place, 64 bytes per iteration, beginning at byte offset
// start. Into out[j] it XORs the GF2P8AFFINEQB transforms of in[0] by
// matrix[j] and of in[1] by matrix[3+j]. Only n/64 whole blocks are handled;
// when n < 64 no slice data is touched.
//
// Register use:
//   AX        blocks remaining
//   DX        in[0] cursor
//   CX        matrix base during setup, then in[1] cursor
//   SI DI BX  out[0..2] cursors
//   Z0-Z5     matrix[0..5]
//   Z6-Z8     output accumulators
//   Z9 Z10    input block / scratch
TEXT ·mulGFNI_2x3_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x3_64Xor_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0..2] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), BX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, BX

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, CX

mulGFNI_2x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (SI), Z6
	VMOVDQU64 (DI), Z7
	VMOVDQU64 (BX), Z8

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z9
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z9, Z10
	VXORPD         Z6, Z10, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z9, Z10
	VXORPD         Z7, Z10, Z7
	VGF2P8AFFINEQB $0x00, Z2, Z9, Z10
	VXORPD         Z8, Z10, Z8

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (CX), Z9
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
	VXORPD         Z6, Z10, Z6
	VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
	VXORPD         Z7, Z10, Z7
	VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
	VXORPD         Z8, Z10, Z8

	// Store 3 outputs
	VMOVDQU64 Z6, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z7, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z8, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x3_64Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulGFNI_2x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Updates out[0..2] in place, 32 bytes per iteration, beginning at byte offset
// start. Into out[j] it XORs the GF2P8AFFINEQB transforms of in[0] by
// matrix[j] and of in[1] by matrix[3+j]. Only n/32 whole blocks are handled;
// when n < 32 no slice data is touched.
//
// Register use:
//   AX        blocks remaining
//   DX        in[0] cursor
//   CX        matrix base during setup, then in[1] cursor
//   SI DI BX  out[0..2] cursors
//   Y0-Y5     matrix[0..5]
//   Y6-Y8     output accumulators
//   Y9 Y10    input block / scratch
TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x3Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// in[0] / in[1] data pointers; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// out[0..2] data pointers.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), BX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, BX

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, CX

mulAvxGFNI_2x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (SI), Y6
	VMOVDQU (DI), Y7
	VMOVDQU (BX), Y8

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU        (DX), Y9
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y9, Y10
	VXORPD         Y6, Y10, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y9, Y10
	VXORPD         Y7, Y10, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y9, Y10
	VXORPD         Y8, Y10, Y8

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (CX), Y9
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
	VXORPD         Y6, Y10, Y6
	VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
	VXORPD         Y7, Y10, Y7
	VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
	VXORPD         Y8, Y10, Y8

	// Store 3 outputs
	VMOVDQU Y6, (SI)
	ADDQ    $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y8, (BX)
	ADDQ    $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x3Xor_loop

	// Only reached after vector registers were used; the early-out skips it.
	VZEROUPPER

mulAvxGFNI_2x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x4_64 reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..3] with the XOR of the two transformed inputs.
// matrix[i*4+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x4_64_end

	// Z0-Z3: matrices for input 0, Z4-Z7: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, BX = out[0..3].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), BX
	MOVQ            start+72(FP), R9

	// Add start offset to output
	ADDQ            R9, SI
	ADDQ            R9, DI
	ADDQ            R9, R8
	ADDQ            R9, BX

	// Add start offset to input
	ADDQ            R9, DX
	ADDQ            R9, CX

mulGFNI_2x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64       (DX), Z12
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z12, Z8
	VGF2P8AFFINEQB  $0x00, Z1, Z12, Z9
	VGF2P8AFFINEQB  $0x00, Z2, Z12, Z10
	VGF2P8AFFINEQB  $0x00, Z3, Z12, Z11

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64       (CX), Z12
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z4, Z12, Z13
	VXORPD          Z8, Z13, Z8
	VGF2P8AFFINEQB  $0x00, Z5, Z12, Z13
	VXORPD          Z9, Z13, Z9
	VGF2P8AFFINEQB  $0x00, Z6, Z12, Z13
	VXORPD          Z10, Z13, Z10
	VGF2P8AFFINEQB  $0x00, Z7, Z12, Z13
	VXORPD          Z11, Z13, Z11

	// Store 4 outputs
	VMOVDQU64       Z8, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z9, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z10, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z11, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x4_64_loop
	VZEROUPPER

mulGFNI_2x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_2x4 reads 32-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..3] with the XOR of the two transformed inputs.
// matrix[i*4+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/32 whole
// blocks; the trailing n%32 bytes are left untouched and nothing happens when
// n < 32. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulAvxGFNI_2x4(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x05, AX
	JZ              mulAvxGFNI_2x4_end

	// Y0-Y3: matrices for input 0, Y4-Y7: matrices for input 1.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, BX = out[0..3].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), BX
	MOVQ            start+72(FP), R9

	// Add start offset to output
	ADDQ            R9, SI
	ADDQ            R9, DI
	ADDQ            R9, R8
	ADDQ            R9, BX

	// Add start offset to input
	ADDQ            R9, DX
	ADDQ            R9, CX

mulAvxGFNI_2x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU         (DX), Y12
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y12, Y8
	VGF2P8AFFINEQB  $0x00, Y1, Y12, Y9
	VGF2P8AFFINEQB  $0x00, Y2, Y12, Y10
	VGF2P8AFFINEQB  $0x00, Y3, Y12, Y11

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU         (CX), Y12
	ADDQ            $0x20, CX
	VGF2P8AFFINEQB  $0x00, Y4, Y12, Y13
	VXORPD          Y8, Y13, Y8
	VGF2P8AFFINEQB  $0x00, Y5, Y12, Y13
	VXORPD          Y9, Y13, Y9
	VGF2P8AFFINEQB  $0x00, Y6, Y12, Y13
	VXORPD          Y10, Y13, Y10
	VGF2P8AFFINEQB  $0x00, Y7, Y12, Y13
	VXORPD          Y11, Y13, Y11

	// Store 4 outputs
	VMOVDQU         Y8, (SI)
	ADDQ            $0x20, SI
	VMOVDQU         Y9, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y10, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y11, (BX)
	ADDQ            $0x20, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_2x4_loop
	VZEROUPPER

mulAvxGFNI_2x4_end:
	RET
|
|
|
|
// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x4_64Xor reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and XORs the results into the existing contents of out[0..3].
// matrix[i*4+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x4_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x4_64Xor_end

	// Z0-Z3: matrices for input 0, Z4-Z7: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, BX = out[0..3].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), BX
	MOVQ            start+72(FP), R9

	// Add start offset to output
	ADDQ            R9, SI
	ADDQ            R9, DI
	ADDQ            R9, R8
	ADDQ            R9, BX

	// Add start offset to input
	ADDQ            R9, DX
	ADDQ            R9, CX

mulGFNI_2x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64       (SI), Z8
	VMOVDQU64       (DI), Z9
	VMOVDQU64       (R8), Z10
	VMOVDQU64       (BX), Z11

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64       (DX), Z12
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z12, Z13
	VXORPD          Z8, Z13, Z8
	VGF2P8AFFINEQB  $0x00, Z1, Z12, Z13
	VXORPD          Z9, Z13, Z9
	VGF2P8AFFINEQB  $0x00, Z2, Z12, Z13
	VXORPD          Z10, Z13, Z10
	VGF2P8AFFINEQB  $0x00, Z3, Z12, Z13
	VXORPD          Z11, Z13, Z11

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64       (CX), Z12
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z4, Z12, Z13
	VXORPD          Z8, Z13, Z8
	VGF2P8AFFINEQB  $0x00, Z5, Z12, Z13
	VXORPD          Z9, Z13, Z9
	VGF2P8AFFINEQB  $0x00, Z6, Z12, Z13
	VXORPD          Z10, Z13, Z10
	VGF2P8AFFINEQB  $0x00, Z7, Z12, Z13
	VXORPD          Z11, Z13, Z11

	// Store 4 outputs
	VMOVDQU64       Z8, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z9, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z10, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z11, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x4_64Xor_loop
	VZEROUPPER

mulGFNI_2x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_2x4Xor reads 32-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and XORs the results into the existing contents of out[0..3].
// matrix[i*4+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/32 whole
// blocks; the trailing n%32 bytes are left untouched and nothing happens when
// n < 32. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x05, AX
	JZ              mulAvxGFNI_2x4Xor_end

	// Y0-Y3: matrices for input 0, Y4-Y7: matrices for input 1.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, BX = out[0..3].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), BX
	MOVQ            start+72(FP), R9

	// Add start offset to output
	ADDQ            R9, SI
	ADDQ            R9, DI
	ADDQ            R9, R8
	ADDQ            R9, BX

	// Add start offset to input
	ADDQ            R9, DX
	ADDQ            R9, CX

mulAvxGFNI_2x4Xor_loop:
	// Load 4 outputs
	VMOVDQU         (SI), Y8
	VMOVDQU         (DI), Y9
	VMOVDQU         (R8), Y10
	VMOVDQU         (BX), Y11

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU         (DX), Y12
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y12, Y13
	VXORPD          Y8, Y13, Y8
	VGF2P8AFFINEQB  $0x00, Y1, Y12, Y13
	VXORPD          Y9, Y13, Y9
	VGF2P8AFFINEQB  $0x00, Y2, Y12, Y13
	VXORPD          Y10, Y13, Y10
	VGF2P8AFFINEQB  $0x00, Y3, Y12, Y13
	VXORPD          Y11, Y13, Y11

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU         (CX), Y12
	ADDQ            $0x20, CX
	VGF2P8AFFINEQB  $0x00, Y4, Y12, Y13
	VXORPD          Y8, Y13, Y8
	VGF2P8AFFINEQB  $0x00, Y5, Y12, Y13
	VXORPD          Y9, Y13, Y9
	VGF2P8AFFINEQB  $0x00, Y6, Y12, Y13
	VXORPD          Y10, Y13, Y10
	VGF2P8AFFINEQB  $0x00, Y7, Y12, Y13
	VXORPD          Y11, Y13, Y11

	// Store 4 outputs
	VMOVDQU         Y8, (SI)
	ADDQ            $0x20, SI
	VMOVDQU         Y9, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y10, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y11, (BX)
	ADDQ            $0x20, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_2x4Xor_loop
	VZEROUPPER

mulAvxGFNI_2x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x5_64 reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..4] with the XOR of the two transformed inputs.
// matrix[i*5+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x5_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 17 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x5_64_end

	// Z0-Z4: matrices for input 0, Z5-Z9: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, R9, BX = out[0..4].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), R9
	MOVQ            96(BX), BX
	MOVQ            start+72(FP), R10

	// Add start offset to output
	ADDQ            R10, SI
	ADDQ            R10, DI
	ADDQ            R10, R8
	ADDQ            R10, R9
	ADDQ            R10, BX

	// Add start offset to input
	ADDQ            R10, DX
	ADDQ            R10, CX

mulGFNI_2x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64       (DX), Z15
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z15, Z10
	VGF2P8AFFINEQB  $0x00, Z1, Z15, Z11
	VGF2P8AFFINEQB  $0x00, Z2, Z15, Z12
	VGF2P8AFFINEQB  $0x00, Z3, Z15, Z13
	VGF2P8AFFINEQB  $0x00, Z4, Z15, Z14

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64       (CX), Z15
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z5, Z15, Z16
	VXORPD          Z10, Z16, Z10
	VGF2P8AFFINEQB  $0x00, Z6, Z15, Z16
	VXORPD          Z11, Z16, Z11
	VGF2P8AFFINEQB  $0x00, Z7, Z15, Z16
	VXORPD          Z12, Z16, Z12
	VGF2P8AFFINEQB  $0x00, Z8, Z15, Z16
	VXORPD          Z13, Z16, Z13
	VGF2P8AFFINEQB  $0x00, Z9, Z15, Z16
	VXORPD          Z14, Z16, Z14

	// Store 5 outputs
	VMOVDQU64       Z10, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z11, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z12, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z13, (R9)
	ADDQ            $0x40, R9
	VMOVDQU64       Z14, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x5_64_loop
	VZEROUPPER

mulGFNI_2x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_2x5 reads 32-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..4] with the XOR of the two transformed inputs.
// matrix[i*5+j] is the bit matrix applied to input i for output j. Only
// matrix[0..8] stay resident in Y0-Y8; matrix[9] is re-broadcast from memory
// into Y15 on every iteration, so CX keeps pointing at the matrix.
// Processing starts at byte offset start in every slice and covers n/32 whole
// blocks; the trailing n%32 bytes are left untouched and nothing happens when
// n < 32. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulAvxGFNI_2x5(SB), $0-88
	// Loading 9 of 10 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 17 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x05, AX
	JZ              mulAvxGFNI_2x5_end

	// Y0-Y4: matrices for input 0, Y5-Y8: first four matrices for input 1.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7
	VBROADCASTSD    64(CX), Y8

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// BX = in[0], DX = in[1]; DI, R8, R9, R10, SI = out[0..4].
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), DX
	MOVQ            out_base+48(FP), SI
	MOVQ            (SI), DI
	MOVQ            24(SI), R8
	MOVQ            48(SI), R9
	MOVQ            72(SI), R10
	MOVQ            96(SI), SI
	MOVQ            start+72(FP), R11

	// Add start offset to output
	ADDQ            R11, DI
	ADDQ            R11, R8
	ADDQ            R11, R9
	ADDQ            R11, R10
	ADDQ            R11, SI

	// Add start offset to input
	ADDQ            R11, BX
	ADDQ            R11, DX

mulAvxGFNI_2x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y8, Y14, Y15
	VXORPD          Y12, Y15, Y12

	// matrix[9] has no resident register; reload it into the scratch register.
	VBROADCASTSD    72(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU         Y9, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y10, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y11, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y12, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y13, (SI)
	ADDQ            $0x20, SI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_2x5_loop
	VZEROUPPER

mulAvxGFNI_2x5_end:
	RET
|
|
|
|
// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x5_64Xor reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and XORs the results into the existing contents of out[0..4].
// matrix[i*5+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x5_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 17 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x5_64Xor_end

	// Z0-Z4: matrices for input 0, Z5-Z9: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, R9, BX = out[0..4].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), R9
	MOVQ            96(BX), BX
	MOVQ            start+72(FP), R10

	// Add start offset to output
	ADDQ            R10, SI
	ADDQ            R10, DI
	ADDQ            R10, R8
	ADDQ            R10, R9
	ADDQ            R10, BX

	// Add start offset to input
	ADDQ            R10, DX
	ADDQ            R10, CX

mulGFNI_2x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64       (SI), Z10
	VMOVDQU64       (DI), Z11
	VMOVDQU64       (R8), Z12
	VMOVDQU64       (R9), Z13
	VMOVDQU64       (BX), Z14

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64       (DX), Z15
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z15, Z16
	VXORPD          Z10, Z16, Z10
	VGF2P8AFFINEQB  $0x00, Z1, Z15, Z16
	VXORPD          Z11, Z16, Z11
	VGF2P8AFFINEQB  $0x00, Z2, Z15, Z16
	VXORPD          Z12, Z16, Z12
	VGF2P8AFFINEQB  $0x00, Z3, Z15, Z16
	VXORPD          Z13, Z16, Z13
	VGF2P8AFFINEQB  $0x00, Z4, Z15, Z16
	VXORPD          Z14, Z16, Z14

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64       (CX), Z15
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z5, Z15, Z16
	VXORPD          Z10, Z16, Z10
	VGF2P8AFFINEQB  $0x00, Z6, Z15, Z16
	VXORPD          Z11, Z16, Z11
	VGF2P8AFFINEQB  $0x00, Z7, Z15, Z16
	VXORPD          Z12, Z16, Z12
	VGF2P8AFFINEQB  $0x00, Z8, Z15, Z16
	VXORPD          Z13, Z16, Z13
	VGF2P8AFFINEQB  $0x00, Z9, Z15, Z16
	VXORPD          Z14, Z16, Z14

	// Store 5 outputs
	VMOVDQU64       Z10, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z11, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z12, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z13, (R9)
	ADDQ            $0x40, R9
	VMOVDQU64       Z14, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x5_64Xor_loop
	VZEROUPPER

mulGFNI_2x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_2x5Xor reads 32-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and XORs the results into the existing contents of out[0..4].
// matrix[i*5+j] is the bit matrix applied to input i for output j. Only
// matrix[0..8] stay resident in Y0-Y8; matrix[9] is re-broadcast from memory
// into Y15 on every iteration, so CX keeps pointing at the matrix.
// Processing starts at byte offset start in every slice and covers n/32 whole
// blocks; the trailing n%32 bytes are left untouched and nothing happens when
// n < 32. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88
	// Loading 9 of 10 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 17 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x05, AX
	JZ              mulAvxGFNI_2x5Xor_end

	// Y0-Y4: matrices for input 0, Y5-Y8: first four matrices for input 1.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7
	VBROADCASTSD    64(CX), Y8

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// BX = in[0], DX = in[1]; DI, R8, R9, R10, SI = out[0..4].
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), DX
	MOVQ            out_base+48(FP), SI
	MOVQ            (SI), DI
	MOVQ            24(SI), R8
	MOVQ            48(SI), R9
	MOVQ            72(SI), R10
	MOVQ            96(SI), SI
	MOVQ            start+72(FP), R11

	// Add start offset to output
	ADDQ            R11, DI
	ADDQ            R11, R8
	ADDQ            R11, R9
	ADDQ            R11, R10
	ADDQ            R11, SI

	// Add start offset to input
	ADDQ            R11, BX
	ADDQ            R11, DX

mulAvxGFNI_2x5Xor_loop:
	// Load 5 outputs
	VMOVDQU         (DI), Y9
	VMOVDQU         (R8), Y10
	VMOVDQU         (R9), Y11
	VMOVDQU         (R10), Y12
	VMOVDQU         (SI), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y8, Y14, Y15
	VXORPD          Y12, Y15, Y12

	// matrix[9] has no resident register; reload it into the scratch register.
	VBROADCASTSD    72(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU         Y9, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y10, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y11, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y12, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y13, (SI)
	ADDQ            $0x20, SI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_2x5Xor_loop
	VZEROUPPER

mulAvxGFNI_2x5Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x6_64 reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..5] with the XOR of the two transformed inputs.
// matrix[i*6+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x6_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x6_64_end

	// Z0-Z5: matrices for input 0, Z6-Z11: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, R9, R10, BX = out[0..5].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), R9
	MOVQ            96(BX), R10
	MOVQ            120(BX), BX
	MOVQ            start+72(FP), R11

	// Add start offset to output
	ADDQ            R11, SI
	ADDQ            R11, DI
	ADDQ            R11, R8
	ADDQ            R11, R9
	ADDQ            R11, R10
	ADDQ            R11, BX

	// Add start offset to input
	ADDQ            R11, DX
	ADDQ            R11, CX

mulGFNI_2x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64       (DX), Z18
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z18, Z12
	VGF2P8AFFINEQB  $0x00, Z1, Z18, Z13
	VGF2P8AFFINEQB  $0x00, Z2, Z18, Z14
	VGF2P8AFFINEQB  $0x00, Z3, Z18, Z15
	VGF2P8AFFINEQB  $0x00, Z4, Z18, Z16
	VGF2P8AFFINEQB  $0x00, Z5, Z18, Z17

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64       (CX), Z18
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z6, Z18, Z19
	VXORPD          Z12, Z19, Z12
	VGF2P8AFFINEQB  $0x00, Z7, Z18, Z19
	VXORPD          Z13, Z19, Z13
	VGF2P8AFFINEQB  $0x00, Z8, Z18, Z19
	VXORPD          Z14, Z19, Z14
	VGF2P8AFFINEQB  $0x00, Z9, Z18, Z19
	VXORPD          Z15, Z19, Z15
	VGF2P8AFFINEQB  $0x00, Z10, Z18, Z19
	VXORPD          Z16, Z19, Z16
	VGF2P8AFFINEQB  $0x00, Z11, Z18, Z19
	VXORPD          Z17, Z19, Z17

	// Store 6 outputs
	VMOVDQU64       Z12, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z13, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z14, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z15, (R9)
	ADDQ            $0x40, R9
	VMOVDQU64       Z16, (R10)
	ADDQ            $0x40, R10
	VMOVDQU64       Z17, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x6_64_loop
	VZEROUPPER

mulGFNI_2x6_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_2x6 reads 32-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..5] with the XOR of the two transformed inputs.
// matrix[i*6+j] is the bit matrix applied to input i for output j. Only
// matrix[0..7] stay resident in Y0-Y7; matrix[8..11] are re-broadcast from
// memory into Y15 on every iteration, so CX keeps pointing at the matrix.
// Processing starts at byte offset start in every slice and covers n/32 whole
// blocks; the trailing n%32 bytes are left untouched and nothing happens when
// n < 32. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulAvxGFNI_2x6(SB), $0-88
	// Loading 8 of 12 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x05, AX
	JZ              mulAvxGFNI_2x6_end

	// Y0-Y5: matrices for input 0, Y6-Y7: first two matrices for input 1.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// BX = in[0], DX = in[1]; DI, R8, R9, R10, R11, SI = out[0..5].
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), DX
	MOVQ            out_base+48(FP), SI
	MOVQ            (SI), DI
	MOVQ            24(SI), R8
	MOVQ            48(SI), R9
	MOVQ            72(SI), R10
	MOVQ            96(SI), R11
	MOVQ            120(SI), SI
	MOVQ            start+72(FP), R12

	// Add start offset to output
	ADDQ            R12, DI
	ADDQ            R12, R8
	ADDQ            R12, R9
	ADDQ            R12, R10
	ADDQ            R12, R11
	ADDQ            R12, SI

	// Add start offset to input
	ADDQ            R12, BX
	ADDQ            R12, DX

mulAvxGFNI_2x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y8, Y15, Y8
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y9, Y15, Y9

	// matrix[8..11] have no resident register; each is reloaded into the
	// scratch register right before use.
	VBROADCASTSD    64(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VBROADCASTSD    72(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    80(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    88(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU         Y8, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y9, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y10, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y11, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y12, (R11)
	ADDQ            $0x20, R11
	VMOVDQU         Y13, (SI)
	ADDQ            $0x20, SI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_2x6_loop
	VZEROUPPER

mulAvxGFNI_2x6_end:
	RET
|
|
|
|
// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x6_64Xor reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and XORs the results into the existing contents of out[0..5].
// matrix[i*6+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x6_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x6_64Xor_end

	// Z0-Z5: matrices for input 0, Z6-Z11: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, R9, R10, BX = out[0..5].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), R9
	MOVQ            96(BX), R10
	MOVQ            120(BX), BX
	MOVQ            start+72(FP), R11

	// Add start offset to output
	ADDQ            R11, SI
	ADDQ            R11, DI
	ADDQ            R11, R8
	ADDQ            R11, R9
	ADDQ            R11, R10
	ADDQ            R11, BX

	// Add start offset to input
	ADDQ            R11, DX
	ADDQ            R11, CX

mulGFNI_2x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64       (SI), Z12
	VMOVDQU64       (DI), Z13
	VMOVDQU64       (R8), Z14
	VMOVDQU64       (R9), Z15
	VMOVDQU64       (R10), Z16
	VMOVDQU64       (BX), Z17

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64       (DX), Z18
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z18, Z19
	VXORPD          Z12, Z19, Z12
	VGF2P8AFFINEQB  $0x00, Z1, Z18, Z19
	VXORPD          Z13, Z19, Z13
	VGF2P8AFFINEQB  $0x00, Z2, Z18, Z19
	VXORPD          Z14, Z19, Z14
	VGF2P8AFFINEQB  $0x00, Z3, Z18, Z19
	VXORPD          Z15, Z19, Z15
	VGF2P8AFFINEQB  $0x00, Z4, Z18, Z19
	VXORPD          Z16, Z19, Z16
	VGF2P8AFFINEQB  $0x00, Z5, Z18, Z19
	VXORPD          Z17, Z19, Z17

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64       (CX), Z18
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z6, Z18, Z19
	VXORPD          Z12, Z19, Z12
	VGF2P8AFFINEQB  $0x00, Z7, Z18, Z19
	VXORPD          Z13, Z19, Z13
	VGF2P8AFFINEQB  $0x00, Z8, Z18, Z19
	VXORPD          Z14, Z19, Z14
	VGF2P8AFFINEQB  $0x00, Z9, Z18, Z19
	VXORPD          Z15, Z19, Z15
	VGF2P8AFFINEQB  $0x00, Z10, Z18, Z19
	VXORPD          Z16, Z19, Z16
	VGF2P8AFFINEQB  $0x00, Z11, Z18, Z19
	VXORPD          Z17, Z19, Z17

	// Store 6 outputs
	VMOVDQU64       Z12, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z13, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z14, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z15, (R9)
	ADDQ            $0x40, R9
	VMOVDQU64       Z16, (R10)
	ADDQ            $0x40, R10
	VMOVDQU64       Z17, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x6_64Xor_loop
	VZEROUPPER

mulGFNI_2x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_2x6Xor reads 32-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and XORs the results into the existing contents of out[0..5].
// matrix[i*6+j] is the bit matrix applied to input i for output j. Only
// matrix[0..7] stay resident in Y0-Y7; matrix[8..11] are re-broadcast from
// memory into Y15 on every iteration, so CX keeps pointing at the matrix.
// Processing starts at byte offset start in every slice and covers n/32 whole
// blocks; the trailing n%32 bytes are left untouched and nothing happens when
// n < 32. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88
	// Loading 8 of 12 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x05, AX
	JZ              mulAvxGFNI_2x6Xor_end

	// Y0-Y5: matrices for input 0, Y6-Y7: first two matrices for input 1.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// BX = in[0], DX = in[1]; DI, R8, R9, R10, R11, SI = out[0..5].
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), DX
	MOVQ            out_base+48(FP), SI
	MOVQ            (SI), DI
	MOVQ            24(SI), R8
	MOVQ            48(SI), R9
	MOVQ            72(SI), R10
	MOVQ            96(SI), R11
	MOVQ            120(SI), SI
	MOVQ            start+72(FP), R12

	// Add start offset to output
	ADDQ            R12, DI
	ADDQ            R12, R8
	ADDQ            R12, R9
	ADDQ            R12, R10
	ADDQ            R12, R11
	ADDQ            R12, SI

	// Add start offset to input
	ADDQ            R12, BX
	ADDQ            R12, DX

mulAvxGFNI_2x6Xor_loop:
	// Load 6 outputs
	VMOVDQU         (DI), Y8
	VMOVDQU         (R8), Y9
	VMOVDQU         (R9), Y10
	VMOVDQU         (R10), Y11
	VMOVDQU         (R11), Y12
	VMOVDQU         (SI), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y15
	VXORPD          Y8, Y15, Y8
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y8, Y15, Y8
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y9, Y15, Y9

	// matrix[8..11] have no resident register; each is reloaded into the
	// scratch register right before use.
	VBROADCASTSD    64(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VBROADCASTSD    72(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    80(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    88(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU         Y8, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y9, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y10, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y11, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y12, (R11)
	ADDQ            $0x20, R11
	VMOVDQU         Y13, (SI)
	ADDQ            $0x20, SI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_2x6Xor_loop
	VZEROUPPER

mulAvxGFNI_2x6Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_2x7_64 reads 64-byte blocks from in[0] and in[1], runs each block
// through VGF2P8AFFINEQB (8x8 bit-matrix transform, constant term 0) once per
// output, and overwrites out[0..6] with the XOR of the two transformed inputs.
// matrix[i*7+j] is the bit matrix applied to input i for output j.
// Processing starts at byte offset start in every slice and covers n/64 whole
// blocks; the trailing n%64 bytes are left untouched and nothing happens when
// n < 64. No slice length is read here, so the caller must guarantee that
// every slice holds at least start+n bytes.
// NOTE(review): presumably each matrix entry encodes multiplication by one
// GF(2^8) coefficient - confirm against the Go caller.
TEXT ·mulGFNI_2x7_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 23 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ with a non-zero count sets ZF
	// from its result, so the branch needs no separate TESTQ.
	SHRQ            $0x06, AX
	JZ              mulGFNI_2x7_64_end

	// Z0-Z6: matrices for input 0, Z7-Z13: matrices for input 1.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13

	// Slice headers are 24 bytes apart; only the data pointers are read.
	// DX = in[0], CX = in[1]; SI, DI, R8, R9, R10, R11, BX = out[0..6].
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), CX
	MOVQ            out_base+48(FP), BX
	MOVQ            (BX), SI
	MOVQ            24(BX), DI
	MOVQ            48(BX), R8
	MOVQ            72(BX), R9
	MOVQ            96(BX), R10
	MOVQ            120(BX), R11
	MOVQ            144(BX), BX
	MOVQ            start+72(FP), R12

	// Add start offset to output
	ADDQ            R12, SI
	ADDQ            R12, DI
	ADDQ            R12, R8
	ADDQ            R12, R9
	ADDQ            R12, R10
	ADDQ            R12, R11
	ADDQ            R12, BX

	// Add start offset to input
	ADDQ            R12, DX
	ADDQ            R12, CX

mulGFNI_2x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64       (DX), Z21
	ADDQ            $0x40, DX
	VGF2P8AFFINEQB  $0x00, Z0, Z21, Z14
	VGF2P8AFFINEQB  $0x00, Z1, Z21, Z15
	VGF2P8AFFINEQB  $0x00, Z2, Z21, Z16
	VGF2P8AFFINEQB  $0x00, Z3, Z21, Z17
	VGF2P8AFFINEQB  $0x00, Z4, Z21, Z18
	VGF2P8AFFINEQB  $0x00, Z5, Z21, Z19
	VGF2P8AFFINEQB  $0x00, Z6, Z21, Z20

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64       (CX), Z21
	ADDQ            $0x40, CX
	VGF2P8AFFINEQB  $0x00, Z7, Z21, Z22
	VXORPD          Z14, Z22, Z14
	VGF2P8AFFINEQB  $0x00, Z8, Z21, Z22
	VXORPD          Z15, Z22, Z15
	VGF2P8AFFINEQB  $0x00, Z9, Z21, Z22
	VXORPD          Z16, Z22, Z16
	VGF2P8AFFINEQB  $0x00, Z10, Z21, Z22
	VXORPD          Z17, Z22, Z17
	VGF2P8AFFINEQB  $0x00, Z11, Z21, Z22
	VXORPD          Z18, Z22, Z18
	VGF2P8AFFINEQB  $0x00, Z12, Z21, Z22
	VXORPD          Z19, Z22, Z19
	VGF2P8AFFINEQB  $0x00, Z13, Z21, Z22
	VXORPD          Z20, Z22, Z20

	// Store 7 outputs
	VMOVDQU64       Z14, (SI)
	ADDQ            $0x40, SI
	VMOVDQU64       Z15, (DI)
	ADDQ            $0x40, DI
	VMOVDQU64       Z16, (R8)
	ADDQ            $0x40, R8
	VMOVDQU64       Z17, (R9)
	ADDQ            $0x40, R9
	VMOVDQU64       Z18, (R10)
	ADDQ            $0x40, R10
	VMOVDQU64       Z19, (R11)
	ADDQ            $0x40, R11
	VMOVDQU64       Z20, (BX)
	ADDQ            $0x40, BX

	// Prepare for next loop
	DECQ            AX
	JNZ             mulGFNI_2x7_64_loop
	VZEROUPPER

mulGFNI_2x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 7 outputs, 32 bytes per iteration. Each output chunk is
// overwritten with the XOR of two VGF2P8AFFINEQB products (constant term
// $0x00): matrix[j] applied to in[0] and matrix[7+j] applied to in[1].
// n>>5 iterations are run; when that is zero the routine returns at once.
// All input and output cursors begin start bytes into their slices.
//
// Register use:
//   AX              remaining iterations
//   CX              matrix base (read on every iteration)
//   BX, DX          read cursors for in[0], in[1]
//   DI, R8-R12, SI  write cursors for out[0]..out[6]
//   R13             start offset
//   Y0-Y6           matrix[0..6], broadcast once
//   Y7-Y13          accumulators for out[0]..out[6]
//   Y14             current input chunk
//   Y15             scratch: matrix[7..13] broadcast, then its product
TEXT ·mulAvxGFNI_2x7(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x7_end

	// Only 7 of the 14 matrices are kept resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), SI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, SI

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, DX

mulAvxGFNI_2x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x7_loop
	VZEROUPPER

mulAvxGFNI_2x7_end:
	RET
|
|
|
|
// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 7 outputs, 64 bytes per iteration, accumulating variant: the
// existing contents of each output chunk are loaded and XORed with two
// VGF2P8AFFINEQB products (constant term $0x00): matrix[j] applied to in[0]
// and matrix[7+j] applied to in[1]. n>>6 iterations are run; when that is
// zero the routine returns at once. All cursors begin start bytes into
// their slices.
//
// Register use:
//   AX                 remaining iterations
//   CX                 matrix base during setup, then read cursor for in[1]
//   DX                 read cursor for in[0]
//   SI, DI, R8-R11, BX read/write cursors for out[0]..out[6]
//   R12                start offset
//   Z0-Z13             matrix[0..13], broadcast once
//   Z14-Z20            accumulators for out[0]..out[6]
//   Z21                current input chunk
//   Z22                scratch product
TEXT ·mulGFNI_2x7_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x7_64Xor_end

	// All 14 matrices stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), BX
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, BX

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, CX

mulGFNI_2x7_64Xor_loop:
	// Load 7 outputs
	VMOVDQU64 (SI), Z14
	VMOVDQU64 (DI), Z15
	VMOVDQU64 (R8), Z16
	VMOVDQU64 (R9), Z17
	VMOVDQU64 (R10), Z18
	VMOVDQU64 (R11), Z19
	VMOVDQU64 (BX), Z20

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64      (DX), Z21
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
	VXORPD         Z14, Z22, Z14
	VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
	VXORPD         Z15, Z22, Z15
	VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
	VXORPD         Z16, Z22, Z16
	VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
	VXORPD         Z17, Z22, Z17
	VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64      (CX), Z21
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
	VXORPD         Z14, Z22, Z14
	VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
	VXORPD         Z15, Z22, Z15
	VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
	VXORPD         Z16, Z22, Z16
	VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
	VXORPD         Z17, Z22, Z17
	VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Store 7 outputs
	VMOVDQU64 Z14, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z15, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z16, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z20, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x7_64Xor_loop
	VZEROUPPER

mulGFNI_2x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 7 outputs, 32 bytes per iteration, accumulating variant: the
// existing contents of each output chunk are loaded and XORed with two
// VGF2P8AFFINEQB products (constant term $0x00): matrix[j] applied to in[0]
// and matrix[7+j] applied to in[1]. n>>5 iterations are run; when that is
// zero the routine returns at once. All cursors begin start bytes into
// their slices.
//
// Register use:
//   AX              remaining iterations
//   CX              matrix base (read on every iteration)
//   BX, DX          read cursors for in[0], in[1]
//   DI, R8-R12, SI  read/write cursors for out[0]..out[6]
//   R13             start offset
//   Y0-Y6           matrix[0..6], broadcast once
//   Y7-Y13          accumulators for out[0]..out[6]
//   Y14             current input chunk
//   Y15             scratch: product, and matrix[7..13] broadcast
TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x7Xor_end

	// Only 7 of the 14 matrices are kept resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), SI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, SI

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, DX

mulAvxGFNI_2x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (DI), Y7
	VMOVDQU (R8), Y8
	VMOVDQU (R9), Y9
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (SI), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x7Xor_loop
	VZEROUPPER

mulAvxGFNI_2x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 8 outputs, 64 bytes per iteration. Each output chunk is
// overwritten with the XOR of two VGF2P8AFFINEQB products (constant term
// $0x00): matrix[j] applied to in[0] and matrix[8+j] applied to in[1].
// n>>6 iterations are run; when that is zero the routine returns at once.
// All input and output cursors begin start bytes into their slices.
//
// Register use:
//   AX                 remaining iterations
//   CX                 matrix base during setup, then read cursor for in[1]
//   DX                 read cursor for in[0]
//   SI, DI, R8-R12, BX write cursors for out[0]..out[7]
//   R13                start offset
//   Z0-Z15             matrix[0..15], broadcast once
//   Z16-Z23            accumulators for out[0]..out[7]
//   Z24                current input chunk
//   Z25                scratch product
TEXT ·mulGFNI_2x8_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x8_64_end

	// All 16 matrices stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), BX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, BX

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, CX

mulGFNI_2x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (DX), Z24
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z16
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z17
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z19
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z23

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (CX), Z24
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD         Z16, Z25, Z16
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD         Z17, Z25, Z17
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD         Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD         Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD         Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Store 8 outputs
	VMOVDQU64 Z16, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z17, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z18, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z19, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z20, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z21, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z23, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x8_64_loop
	VZEROUPPER

mulGFNI_2x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 8 outputs, 32 bytes per iteration. Each output chunk is
// overwritten with the XOR of two VGF2P8AFFINEQB products (constant term
// $0x00): matrix[j] applied to in[0] and matrix[8+j] applied to in[1].
// n>>5 iterations are run; when that is zero the routine returns at once.
// All input and output cursors begin start bytes into their slices.
//
// Register use:
//   AX              remaining iterations
//   CX              matrix base (read on every iteration)
//   BX, DX          read cursors for in[0], in[1]
//   DI, R8-R13, SI  write cursors for out[0]..out[7]
//   R14             start offset
//   Y0-Y5           matrix[0..5], broadcast once
//   Y6-Y13          accumulators for out[0]..out[7]
//   Y14             current input chunk
//   Y15             scratch: matrix[8..15] broadcast, then its product
TEXT ·mulAvxGFNI_2x8(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x8_end

	// Only 6 of the 16 matrices are kept resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), SI
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, SI

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, DX

mulAvxGFNI_2x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11

	// matrix[6] and matrix[7] are broadcast straight into their accumulators.
	VBROADCASTSD   48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y7, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x8_loop
	VZEROUPPER

mulAvxGFNI_2x8_end:
	RET
|
|
|
|
// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 8 outputs, 64 bytes per iteration, accumulating variant: the
// existing contents of each output chunk are loaded and XORed with two
// VGF2P8AFFINEQB products (constant term $0x00): matrix[j] applied to in[0]
// and matrix[8+j] applied to in[1]. n>>6 iterations are run; when that is
// zero the routine returns at once. All cursors begin start bytes into
// their slices.
//
// Register use:
//   AX                 remaining iterations
//   CX                 matrix base during setup, then read cursor for in[1]
//   DX                 read cursor for in[0]
//   SI, DI, R8-R12, BX read/write cursors for out[0]..out[7]
//   R13                start offset
//   Z0-Z15             matrix[0..15], broadcast once
//   Z16-Z23            accumulators for out[0]..out[7]
//   Z24                current input chunk
//   Z25                scratch product
TEXT ·mulGFNI_2x8_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x8_64Xor_end

	// All 16 matrices stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), BX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, BX

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, CX

mulGFNI_2x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (SI), Z16
	VMOVDQU64 (DI), Z17
	VMOVDQU64 (R8), Z18
	VMOVDQU64 (R9), Z19
	VMOVDQU64 (R10), Z20
	VMOVDQU64 (R11), Z21
	VMOVDQU64 (R12), Z22
	VMOVDQU64 (BX), Z23

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (DX), Z24
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD         Z16, Z25, Z16
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD         Z17, Z25, Z17
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD         Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD         Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD         Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (CX), Z24
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD         Z16, Z25, Z16
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD         Z17, Z25, Z17
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD         Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD         Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD         Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Store 8 outputs
	VMOVDQU64 Z16, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z17, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z18, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z19, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z20, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z21, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z23, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x8_64Xor_loop
	VZEROUPPER

mulGFNI_2x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 8 outputs, 32 bytes per iteration, accumulating variant: the
// existing contents of each output chunk are loaded and XORed with two
// VGF2P8AFFINEQB products (constant term $0x00): matrix[j] applied to in[0]
// and matrix[8+j] applied to in[1]. n>>5 iterations are run; when that is
// zero the routine returns at once. All cursors begin start bytes into
// their slices.
//
// Register use:
//   AX              remaining iterations
//   CX              matrix base (read on every iteration)
//   BX, DX          read cursors for in[0], in[1]
//   DI, R8-R13, SI  read/write cursors for out[0]..out[7]
//   R14             start offset
//   Y0-Y5           matrix[0..5], broadcast once
//   Y6-Y13          accumulators for out[0]..out[7]
//   Y14             current input chunk
//   Y15             scratch: product, and matrix[6..15] broadcast
TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x8Xor_end

	// Only 6 of the 16 matrices are kept resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), SI
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, SI

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, DX

mulAvxGFNI_2x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (DI), Y6
	VMOVDQU (R8), Y7
	VMOVDQU (R9), Y8
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (SI), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y7, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x8Xor_loop
	VZEROUPPER

mulAvxGFNI_2x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 9 outputs, 64 bytes per iteration. Each output chunk is
// overwritten with the XOR of two VGF2P8AFFINEQB products (constant term
// $0x00): matrix[j] applied to in[0] and matrix[9+j] applied to in[1].
// n>>6 iterations are run; when that is zero the routine returns at once.
// All input and output cursors begin start bytes into their slices.
//
// Register use:
//   AX                 remaining iterations
//   CX                 matrix base during setup, then read cursor for in[1]
//   DX                 read cursor for in[0]
//   SI, DI, R8-R13, BX write cursors for out[0]..out[8]
//   R14                start offset
//   Z0-Z17             matrix[0..17], broadcast once
//   Z18-Z26            accumulators for out[0]..out[8]
//   Z27                current input chunk
//   Z28                scratch product
TEXT ·mulGFNI_2x9_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x9_64_end

	// All 18 matrices stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), BX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, BX

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, CX

mulGFNI_2x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64      (DX), Z27
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z27, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z27, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z27, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z27, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z27, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z27, Z23
	VGF2P8AFFINEQB $0x00, Z6, Z27, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z27, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z27, Z26

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (CX), Z27
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
	VXORPD         Z18, Z28, Z18
	VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
	VXORPD         Z19, Z28, Z19
	VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
	VXORPD         Z20, Z28, Z20
	VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
	VXORPD         Z21, Z28, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
	VXORPD         Z22, Z28, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
	VXORPD         Z23, Z28, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
	VXORPD         Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
	VXORPD         Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
	VXORPD         Z26, Z28, Z26

	// Store 9 outputs
	VMOVDQU64 Z18, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z19, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z20, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z21, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z22, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z23, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z24, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z25, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z26, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x9_64_loop
	VZEROUPPER

mulGFNI_2x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 9 outputs, 32 bytes per iteration. Each output chunk is
// overwritten with the XOR of two VGF2P8AFFINEQB products (constant term
// $0x00): matrix[j] applied to in[0] and matrix[9+j] applied to in[1].
// n>>5 iterations are run; when that is zero the routine returns at once.
// All input and output cursors begin start bytes into their slices.
//
// Register use:
//   AX              remaining iterations
//   CX              matrix base (read on every iteration)
//   BX, DX          read cursors for in[0], in[1]
//   DI, R8-R14, SI  write cursors for out[0]..out[8]
//   R15             start offset
//   Y0-Y4           matrix[0..4], broadcast once
//   Y5-Y13          accumulators for out[0]..out[8]
//   Y14             current input chunk
//   Y15             scratch: matrix[9..17] broadcast, then its product
TEXT ·mulAvxGFNI_2x9(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_2x9_end

	// Only 5 of the 18 matrices are kept resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), SI
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, SI

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, DX

mulAvxGFNI_2x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9

	// matrix[5..8] are broadcast straight into their accumulators.
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x9_loop
	VZEROUPPER

mulAvxGFNI_2x9_end:
	RET
|
|
|
|
// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 9 outputs, 64 bytes per iteration, accumulating variant: the
// existing contents of each output chunk are loaded and XORed with two
// VGF2P8AFFINEQB products (constant term $0x00): matrix[j] applied to in[0]
// and matrix[9+j] applied to in[1]. n>>6 iterations are run; when that is
// zero the routine returns at once. All cursors begin start bytes into
// their slices.
//
// Register use:
//   AX                 remaining iterations
//   CX                 matrix base during setup, then read cursor for in[1]
//   DX                 read cursor for in[0]
//   SI, DI, R8-R13, BX read/write cursors for out[0]..out[8]
//   R14                start offset
//   Z0-Z17             matrix[0..17], broadcast once
//   Z18-Z26            accumulators for out[0]..out[8]
//   Z27                current input chunk
//   Z28                scratch product
TEXT ·mulGFNI_2x9_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_2x9_64Xor_end

	// All 18 matrices stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Slice headers are 24 bytes apart; the data pointer is the first word.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), BX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, BX

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, CX

mulGFNI_2x9_64Xor_loop:
	// Load 9 outputs
	VMOVDQU64 (SI), Z18
	VMOVDQU64 (DI), Z19
	VMOVDQU64 (R8), Z20
	VMOVDQU64 (R9), Z21
	VMOVDQU64 (R10), Z22
	VMOVDQU64 (R11), Z23
	VMOVDQU64 (R12), Z24
	VMOVDQU64 (R13), Z25
	VMOVDQU64 (BX), Z26

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64      (DX), Z27
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
	VXORPD         Z18, Z28, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
	VXORPD         Z19, Z28, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
	VXORPD         Z20, Z28, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
	VXORPD         Z21, Z28, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
	VXORPD         Z22, Z28, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
	VXORPD         Z23, Z28, Z23
	VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
	VXORPD         Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
	VXORPD         Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
	VXORPD         Z26, Z28, Z26

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (CX), Z27
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
	VXORPD         Z18, Z28, Z18
	VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
	VXORPD         Z19, Z28, Z19
	VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
	VXORPD         Z20, Z28, Z20
	VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
	VXORPD         Z21, Z28, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
	VXORPD         Z22, Z28, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
	VXORPD         Z23, Z28, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
	VXORPD         Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
	VXORPD         Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
	VXORPD         Z26, Z28, Z26

	// Store 9 outputs
	VMOVDQU64 Z18, (SI)
	ADDQ      $0x40, SI
	VMOVDQU64 Z19, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z20, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z21, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z22, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z23, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z24, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z25, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z26, (BX)
	ADDQ      $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_2x9_64Xor_loop
	VZEROUPPER

mulGFNI_2x9_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_2x9Xor applies 8-byte affine matrices to 2 inputs and XORs the
// results into the existing contents of 9 outputs.
// For input i and output j the matrix is matrix[i*9+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88
	// Loading 5 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_2x9Xor_end

	// Y0-Y4 = matrix[0..4]; the other 13 entries are re-broadcast from (CX)
	// inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// BX, DX = data pointers of in[0], in[1] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX

	// DI, R8-R14, SI = data pointers of out[0..8]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), SI
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, SI

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, DX

mulAvxGFNI_2x9Xor_loop:
	// Load 9 outputs (Y5-Y13 are the accumulators)
	VMOVDQU (DI), Y5
	VMOVDQU (R8), Y6
	VMOVDQU (R9), Y7
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (SI), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	// Y14 = input data, Y15 = scratch (matrix, then product)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_2x9Xor_loop
	VZEROUPPER

mulAvxGFNI_2x9Xor_end:
	RET
|
|
|
|
// mulGFNI_2x10_64 applies 8-byte affine matrices to 2 inputs and writes the
// XOR of the results to 10 outputs, overwriting them.
// For input i and output j the matrix is matrix[i*10+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 64 bytes/loop for n>>6 loops; a trailing n%64 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_2x10_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_2x10_64_end

	// Z0-Z9 = matrices for input 0, Z10-Z19 = matrices for input 1
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// DX, CX = data pointers of in[0], in[1] (CX is free once the matrix is loaded)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// SI, DI, R8-R14, BX = data pointers of out[0..9]
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), R14
	MOVQ 216(BX), BX
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, BX

	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, CX

mulGFNI_2x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs
	// The first input initialises the accumulators Z20-Z29 directly.
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	// Z31 = scratch product, XORed into the accumulators.
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	VMOVDQU64 Z20, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z21, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z22, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (BX)
	ADDQ $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_2x10_64_loop
	VZEROUPPER

mulGFNI_2x10_64_end:
	RET
|
|
|
|
// mulAvxGFNI_2x10 applies 8-byte affine matrices to 2 inputs and writes the
// XOR of the results to 10 outputs, overwriting them.
// For input i and output j the matrix is matrix[i*10+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_2x10(SB), $8-88
	// Loading 4 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_2x10_end

	// Y0-Y3 = matrix[0..3]; the other 16 entries are re-broadcast from (CX)
	// inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// BX, DX = data pointers of in[0], in[1] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX

	// DI, R8-R15, SI = data pointers of out[0..9]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), R15
	MOVQ 216(SI), SI

	// BP is used as scratch for start.
	// NOTE(review): the non-zero frame size ($8) presumably exists so that the
	// assembler saves and restores BP around this body - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, SI

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, DX

mulAvxGFNI_2x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	// The first input initialises the accumulators Y4-Y13 directly; for
	// Y8-Y13 the matrix is broadcast into the accumulator and then replaced
	// by the product.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	// Y15 = scratch (matrix, then product)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	VMOVDQU Y4, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y5, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_2x10_loop
	VZEROUPPER

mulAvxGFNI_2x10_end:
	RET
|
|
|
|
// mulGFNI_2x10_64Xor applies 8-byte affine matrices to 2 inputs and XORs the
// results into the existing contents of 10 outputs.
// For input i and output j the matrix is matrix[i*10+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 64 bytes/loop for n>>6 loops; a trailing n%64 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_2x10_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_2x10_64Xor_end

	// Z0-Z9 = matrices for input 0, Z10-Z19 = matrices for input 1
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// DX, CX = data pointers of in[0], in[1] (CX is free once the matrix is loaded)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), CX

	// SI, DI, R8-R14, BX = data pointers of out[0..9]
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), R14
	MOVQ 216(BX), BX
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, BX

	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, CX

mulGFNI_2x10_64Xor_loop:
	// Load 10 outputs (Z20-Z29 are the accumulators)
	VMOVDQU64 (SI), Z20
	VMOVDQU64 (DI), Z21
	VMOVDQU64 (R8), Z22
	VMOVDQU64 (R9), Z23
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (BX), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	// Z30 = input data, Z31 = scratch product
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	VMOVDQU64 Z20, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z21, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z22, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (BX)
	ADDQ $0x40, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_2x10_64Xor_loop
	VZEROUPPER

mulGFNI_2x10_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_2x10Xor applies 8-byte affine matrices to 2 inputs and XORs the
// results into the existing contents of 10 outputs.
// For input i and output j the matrix is matrix[i*10+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88
	// Loading 4 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_2x10Xor_end

	// Y0-Y3 = matrix[0..3]; the other 16 entries are re-broadcast from (CX)
	// inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// BX, DX = data pointers of in[0], in[1] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), DX

	// DI, R8-R15, SI = data pointers of out[0..9]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), R15
	MOVQ 216(SI), SI

	// BP is used as scratch for start.
	// NOTE(review): the non-zero frame size ($8) presumably exists so that the
	// assembler saves and restores BP around this body - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, SI

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, DX

mulAvxGFNI_2x10Xor_loop:
	// Load 10 outputs (Y4-Y13 are the accumulators)
	VMOVDQU (DI), Y4
	VMOVDQU (R8), Y5
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (SI), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	// Y14 = input data, Y15 = scratch (matrix, then product)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	VMOVDQU Y4, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y5, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_2x10Xor_loop
	VZEROUPPER

mulAvxGFNI_2x10Xor_end:
	RET
|
|
|
|
// mulGFNI_3x1_64 applies 8-byte affine matrices to 3 inputs and writes the
// XOR of the results to 1 output, overwriting it.
// For input i the matrix is matrix[i]; it is broadcast to every 64-bit lane
// and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 64 bytes/loop for n>>6 loops; a trailing n%64 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x1_64_end

	// Z0-Z2 = matrices for inputs 0-2
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// SI = data pointer of out[0]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), SI
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, BX
	ADDQ DI, CX

mulGFNI_3x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// The first input initialises the accumulator Z3 directly.
	VMOVDQU64 (DX), Z4
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z3

	// Load and process 64 bytes from input 1 to 1 outputs
	// Z4 holds the input and is overwritten with its product.
	VMOVDQU64 (BX), Z4
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
	VXORPD Z3, Z4, Z3

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (CX), Z4
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
	VXORPD Z3, Z4, Z3

	// Store 1 outputs
	VMOVDQU64 Z3, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x1_64_loop
	VZEROUPPER

mulGFNI_3x1_64_end:
	RET
|
|
|
|
// mulAvxGFNI_3x1 applies 8-byte affine matrices to 3 inputs and writes the
// XOR of the results to 1 output, overwriting it.
// For input i the matrix is matrix[i]; it is broadcast to every 64-bit lane
// and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x1_end

	// Y0-Y2 = matrices for inputs 0-2
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// SI = data pointer of out[0]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), SI
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, BX
	ADDQ DI, CX

mulAvxGFNI_3x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// The first input initialises the accumulator Y3 directly.
	VMOVDQU (DX), Y4
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y4, Y3

	// Load and process 32 bytes from input 1 to 1 outputs
	// Y4 holds the input and is overwritten with its product.
	VMOVDQU (BX), Y4
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
	VXORPD Y3, Y4, Y3

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (CX), Y4
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
	VXORPD Y3, Y4, Y3

	// Store 1 outputs
	VMOVDQU Y3, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x1_loop
	VZEROUPPER

mulAvxGFNI_3x1_end:
	RET
|
|
|
|
// mulGFNI_3x1_64Xor applies 8-byte affine matrices to 3 inputs and XORs the
// results into the existing contents of 1 output.
// For input i the matrix is matrix[i]; it is broadcast to every 64-bit lane
// and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 64 bytes/loop for n>>6 loops; a trailing n%64 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x1_64Xor_end

	// Z0-Z2 = matrices for inputs 0-2
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// SI = data pointer of out[0]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), SI
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, BX
	ADDQ DI, CX

mulGFNI_3x1_64Xor_loop:
	// Load 1 outputs (Z3 is the accumulator)
	VMOVDQU64 (SI), Z3

	// Load and process 64 bytes from input 0 to 1 outputs
	// Z4 holds the input and is overwritten with its product.
	VMOVDQU64 (DX), Z4
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z4
	VXORPD Z3, Z4, Z3

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z4
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
	VXORPD Z3, Z4, Z3

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (CX), Z4
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
	VXORPD Z3, Z4, Z3

	// Store 1 outputs
	VMOVDQU64 Z3, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x1_64Xor_loop
	VZEROUPPER

mulGFNI_3x1_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_3x1Xor applies 8-byte affine matrices to 3 inputs and XORs the
// results into the existing contents of 1 output.
// For input i the matrix is matrix[i]; it is broadcast to every 64-bit lane
// and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x1Xor_end

	// Y0-Y2 = matrices for inputs 0-2
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// SI = data pointer of out[0]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), SI
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, SI

	// Add start offset to input
	ADDQ DI, DX
	ADDQ DI, BX
	ADDQ DI, CX

mulAvxGFNI_3x1Xor_loop:
	// Load 1 outputs (Y3 is the accumulator)
	VMOVDQU (SI), Y3

	// Load and process 32 bytes from input 0 to 1 outputs
	// Y4 holds the input and is overwritten with its product.
	VMOVDQU (DX), Y4
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y4, Y4
	VXORPD Y3, Y4, Y3

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y4
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
	VXORPD Y3, Y4, Y3

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (CX), Y4
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
	VXORPD Y3, Y4, Y3

	// Store 1 outputs
	VMOVDQU Y3, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x1Xor_loop
	VZEROUPPER

mulAvxGFNI_3x1Xor_end:
	RET
|
|
|
|
// mulGFNI_3x2_64 applies 8-byte affine matrices to 3 inputs and writes the
// XOR of the results to 2 outputs, overwriting them.
// For input i and output j the matrix is matrix[i*2+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 64 bytes/loop for n>>6 loops; a trailing n%64 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x2_64_end

	// Z0-Z1, Z2-Z3, Z4-Z5 = matrices for inputs 0, 1, 2
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// DI, SI = data pointers of out[0], out[1]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), SI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI
	ADDQ R8, SI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, CX

mulGFNI_3x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// The first input initialises the accumulators Z6, Z7 directly.
	VMOVDQU64 (DX), Z8
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z7

	// Load and process 64 bytes from input 1 to 2 outputs
	// Z8 = input data, Z9 = scratch product
	VMOVDQU64 (BX), Z8
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (CX), Z8
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Store 2 outputs
	VMOVDQU64 Z6, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z7, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x2_64_loop
	VZEROUPPER

mulGFNI_3x2_64_end:
	RET
|
|
|
|
// mulAvxGFNI_3x2 applies 8-byte affine matrices to 3 inputs and writes the
// XOR of the results to 2 outputs, overwriting them.
// For input i and output j the matrix is matrix[i*2+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x2(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x2_end

	// Y0-Y1, Y2-Y3, Y4-Y5 = matrices for inputs 0, 1, 2
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// DI, SI = data pointers of out[0], out[1]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), SI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI
	ADDQ R8, SI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, CX

mulAvxGFNI_3x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// The first input initialises the accumulators Y6, Y7 directly.
	VMOVDQU (DX), Y8
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y7

	// Load and process 32 bytes from input 1 to 2 outputs
	// Y8 = input data, Y9 = scratch product
	VMOVDQU (BX), Y8
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (CX), Y8
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Store 2 outputs
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x2_loop
	VZEROUPPER

mulAvxGFNI_3x2_end:
	RET
|
|
|
|
// mulGFNI_3x2_64Xor applies 8-byte affine matrices to 3 inputs and XORs the
// results into the existing contents of 2 outputs.
// For input i and output j the matrix is matrix[i*2+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 64 bytes/loop for n>>6 loops; a trailing n%64 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x2_64Xor_end

	// Z0-Z1, Z2-Z3, Z4-Z5 = matrices for inputs 0, 1, 2
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// DI, SI = data pointers of out[0], out[1]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), SI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI
	ADDQ R8, SI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, CX

mulGFNI_3x2_64Xor_loop:
	// Load 2 outputs (Z6, Z7 are the accumulators)
	VMOVDQU64 (DI), Z6
	VMOVDQU64 (SI), Z7

	// Load and process 64 bytes from input 0 to 2 outputs
	// Z8 = input data, Z9 = scratch product
	VMOVDQU64 (DX), Z8
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z8
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (CX), Z8
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Store 2 outputs
	VMOVDQU64 Z6, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z7, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x2_64Xor_loop
	VZEROUPPER

mulGFNI_3x2_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_3x2Xor applies 8-byte affine matrices to 3 inputs and XORs the
// results into the existing contents of 2 outputs.
// For input i and output j the matrix is matrix[i*2+j]; it is broadcast to
// every 64-bit lane and applied per byte by VGF2P8AFFINEQB (constant 0).
// Processes 32 bytes/loop for n>>5 loops; a trailing n%32 bytes are untouched.
// start is a byte offset added to every input and output pointer.

// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte iterations. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x2Xor_end

	// Y0-Y1, Y2-Y3, Y4-Y5 = matrices for inputs 0, 1, 2
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// DX, BX, CX = data pointers of in[0..2] (slice headers are 24 bytes apart)
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// DI, SI = data pointers of out[0], out[1]
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), SI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI
	ADDQ R8, SI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, CX

mulAvxGFNI_3x2Xor_loop:
	// Load 2 outputs (Y6, Y7 are the accumulators)
	VMOVDQU (DI), Y6
	VMOVDQU (SI), Y7

	// Load and process 32 bytes from input 0 to 2 outputs
	// Y8 = input data, Y9 = scratch product
	VMOVDQU (DX), Y8
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (BX), Y8
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (CX), Y8
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Store 2 outputs
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x2Xor_loop
	VZEROUPPER

mulAvxGFNI_3x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites 3 outputs with the XOR of the VGF2P8AFFINEQB transforms of 3
// inputs, 64 bytes per iteration. Input i contributes to output j through
// matrix[3*i+j]. n>>6 blocks are processed, beginning at byte offset start in
// every input and output; a trailing n&63 bytes are not touched.
//
// Register use:
//   AX          blocks left
//   Z0-Z8       matrix[0..8], each broadcast to every 64-bit lane
//   DX, BX, CX  read cursors for in[0], in[1], in[2]
//   DI, R8, SI  write cursors for out[0], out[1], out[2]
//   Z9-Z11      output accumulators, Z12 current input block, Z13 scratch
TEXT ·mulGFNI_3x3_64(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ   mulGFNI_3x3_64_end

	// Load all 9 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulGFNI_3x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs.
	// The first input initialises the accumulators directly.
	VMOVDQU64      (DX), Z12
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z11

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z12
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD         Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (CX), Z12
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD         Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Store 3 outputs
	VMOVDQU64 Z9, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z11, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_3x3_64_loop
	VZEROUPPER

mulGFNI_3x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites 3 outputs with the XOR of the VGF2P8AFFINEQB transforms of 3
// inputs, 32 bytes per iteration. Input i contributes to output j through
// matrix[3*i+j]. n>>5 blocks are processed, beginning at byte offset start in
// every input and output; a trailing n&31 bytes are not touched.
//
// Register use:
//   AX          blocks left
//   Y0-Y8       matrix[0..8], each broadcast to every 64-bit lane
//   DX, BX, CX  read cursors for in[0], in[1], in[2]
//   DI, R8, SI  write cursors for out[0], out[1], out[2]
//   Y9-Y11      output accumulators, Y12 current input block, Y13 scratch
TEXT ·mulAvxGFNI_3x3(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x3_end

	// Load all 9 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulAvxGFNI_3x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs.
	// The first input initialises the accumulators directly.
	VMOVDQU        (DX), Y12
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y11

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (BX), Y12
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD         Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (CX), Y12
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD         Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Store 3 outputs
	VMOVDQU Y9, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y11, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_3x3_loop
	VZEROUPPER

mulAvxGFNI_3x3_end:
	RET
|
|
|
|
// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs the VGF2P8AFFINEQB transforms of 3 inputs into the existing contents
// of 3 outputs, 64 bytes per iteration. Input i contributes to output j
// through matrix[3*i+j]. n>>6 blocks are processed, beginning at byte offset
// start in every input and output; a trailing n&63 bytes are not touched.
//
// Register use:
//   AX          blocks left
//   Z0-Z8       matrix[0..8], each broadcast to every 64-bit lane
//   DX, BX, CX  read cursors for in[0], in[1], in[2]
//   DI, R8, SI  cursors for out[0], out[1], out[2]
//   Z9-Z11      output accumulators, Z12 current input block, Z13 scratch
TEXT ·mulGFNI_3x3_64Xor(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ   mulGFNI_3x3_64Xor_end

	// Load all 9 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulGFNI_3x3_64Xor_loop:
	// Load 3 outputs so the products accumulate onto them
	VMOVDQU64 (DI), Z9
	VMOVDQU64 (R8), Z10
	VMOVDQU64 (SI), Z11

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z12
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
	VXORPD         Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z12
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD         Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (CX), Z12
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD         Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Store 3 outputs
	VMOVDQU64 Z9, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z11, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_3x3_64Xor_loop
	VZEROUPPER

mulGFNI_3x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs the VGF2P8AFFINEQB transforms of 3 inputs into the existing contents
// of 3 outputs, 32 bytes per iteration. Input i contributes to output j
// through matrix[3*i+j]. n>>5 blocks are processed, beginning at byte offset
// start in every input and output; a trailing n&31 bytes are not touched.
//
// Register use:
//   AX          blocks left
//   Y0-Y8       matrix[0..8], each broadcast to every 64-bit lane
//   DX, BX, CX  read cursors for in[0], in[1], in[2]
//   DI, R8, SI  cursors for out[0], out[1], out[2]
//   Y9-Y11      output accumulators, Y12 current input block, Y13 scratch
TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x3Xor_end

	// Load all 9 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulAvxGFNI_3x3Xor_loop:
	// Load 3 outputs so the products accumulate onto them
	VMOVDQU (DI), Y9
	VMOVDQU (R8), Y10
	VMOVDQU (SI), Y11

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU        (DX), Y12
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
	VXORPD         Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (BX), Y12
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD         Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (CX), Y12
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD         Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Store 3 outputs
	VMOVDQU Y9, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y11, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_3x3Xor_loop
	VZEROUPPER

mulAvxGFNI_3x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites 4 outputs with the XOR of the VGF2P8AFFINEQB transforms of 3
// inputs, 64 bytes per iteration. Input i contributes to output j through
// matrix[4*i+j]. n>>6 blocks are processed, beginning at byte offset start in
// every input and output; a trailing n&63 bytes are not touched.
//
// Register use:
//   AX              blocks left
//   Z0-Z11          matrix[0..11], each broadcast to every 64-bit lane
//   DX, BX, CX      read cursors for in[0], in[1], in[2]
//   DI, R8, R9, SI  write cursors for out[0], out[1], out[2], out[3]
//   Z12-Z15         output accumulators, Z16 current input block, Z17 scratch
TEXT ·mulGFNI_3x4_64(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ   mulGFNI_3x4_64_end

	// Load all 12 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), SI
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, SI

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, CX

mulGFNI_3x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs.
	// The first input initialises the accumulators directly.
	VMOVDQU64      (DX), Z16
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z15

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (BX), Z16
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD         Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD         Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (CX), Z16
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD         Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD         Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Store 4 outputs
	VMOVDQU64 Z12, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z15, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_3x4_64_loop
	VZEROUPPER

mulGFNI_3x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites 4 outputs with the XOR of the VGF2P8AFFINEQB transforms of 3
// inputs, 32 bytes per iteration. Input i contributes to output j through
// matrix[4*i+j]. n>>5 blocks are processed, beginning at byte offset start in
// every input and output; a trailing n&31 bytes are not touched.
//
// Only 10 of the 12 matrix entries are kept in registers; matrix[10] and
// matrix[11] are re-broadcast from memory on every iteration, so CX must
// keep pointing at the matrix for the whole function.
//
// Register use:
//   AX               blocks left
//   CX               matrix base
//   Y0-Y9            matrix[0..9], each broadcast to every 64-bit lane
//   BX, SI, DX       read cursors for in[0], in[1], in[2]
//   R8, R9, R10, DI  write cursors for out[0], out[1], out[2], out[3]
//   Y10-Y13          output accumulators, Y14 current input block,
//                    Y15 scratch / temporary matrix entry
TEXT ·mulAvxGFNI_3x4(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x4_end

	// Load the first 10 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), DI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DI

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DX

mulAvxGFNI_3x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs.
	// The first input initialises the accumulators directly.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs.
	// The last two matrix entries are fetched from memory via Y15.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_3x4_loop
	VZEROUPPER

mulAvxGFNI_3x4_end:
	RET
|
|
|
|
// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs the VGF2P8AFFINEQB transforms of 3 inputs into the existing contents
// of 4 outputs, 64 bytes per iteration. Input i contributes to output j
// through matrix[4*i+j]. n>>6 blocks are processed, beginning at byte offset
// start in every input and output; a trailing n&63 bytes are not touched.
//
// Register use:
//   AX              blocks left
//   Z0-Z11          matrix[0..11], each broadcast to every 64-bit lane
//   DX, BX, CX      read cursors for in[0], in[1], in[2]
//   DI, R8, R9, SI  cursors for out[0], out[1], out[2], out[3]
//   Z12-Z15         output accumulators, Z16 current input block, Z17 scratch
TEXT ·mulGFNI_3x4_64Xor(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ   mulGFNI_3x4_64Xor_end

	// Load all 12 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), SI
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, SI

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, CX

mulGFNI_3x4_64Xor_loop:
	// Load 4 outputs so the products accumulate onto them
	VMOVDQU64 (DI), Z12
	VMOVDQU64 (R8), Z13
	VMOVDQU64 (R9), Z14
	VMOVDQU64 (SI), Z15

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64      (DX), Z16
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
	VXORPD         Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
	VXORPD         Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (BX), Z16
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD         Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD         Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (CX), Z16
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD         Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD         Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Store 4 outputs
	VMOVDQU64 Z12, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z15, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_3x4_64Xor_loop
	VZEROUPPER

mulGFNI_3x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs the VGF2P8AFFINEQB transforms of 3 inputs into the existing contents
// of 4 outputs, 32 bytes per iteration. Input i contributes to output j
// through matrix[4*i+j]. n>>5 blocks are processed, beginning at byte offset
// start in every input and output; a trailing n&31 bytes are not touched.
//
// Only 10 of the 12 matrix entries are kept in registers; matrix[10] and
// matrix[11] are re-broadcast from memory on every iteration, so CX must
// keep pointing at the matrix for the whole function.
//
// Register use:
//   AX               blocks left
//   CX               matrix base
//   Y0-Y9            matrix[0..9], each broadcast to every 64-bit lane
//   BX, SI, DX       read cursors for in[0], in[1], in[2]
//   R8, R9, R10, DI  cursors for out[0], out[1], out[2], out[3]
//   Y10-Y13          output accumulators, Y14 current input block,
//                    Y15 scratch / temporary matrix entry
TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x4Xor_end

	// Load the first 10 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), DI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DI

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DX

mulAvxGFNI_3x4Xor_loop:
	// Load 4 outputs so the products accumulate onto them
	VMOVDQU (R8), Y10
	VMOVDQU (R9), Y11
	VMOVDQU (R10), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs.
	// The last two matrix entries are fetched from memory via Y15.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_3x4Xor_loop
	VZEROUPPER

mulAvxGFNI_3x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites 5 outputs with the XOR of the VGF2P8AFFINEQB transforms of 3
// inputs, 64 bytes per iteration. Input i contributes to output j through
// matrix[5*i+j]. n>>6 blocks are processed, beginning at byte offset start in
// every input and output; a trailing n&63 bytes are not touched.
//
// Register use:
//   AX                   blocks left
//   Z0-Z14               matrix[0..14], each broadcast to every 64-bit lane
//   DX, BX, CX           read cursors for in[0], in[1], in[2]
//   DI, R8, R9, R10, SI  write cursors for out[0] .. out[4]
//   Z15-Z19              output accumulators, Z20 current input block,
//                        Z21 scratch
TEXT ·mulGFNI_3x5_64(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ   mulGFNI_3x5_64_end

	// Load all 15 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), SI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, SI

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, CX

mulGFNI_3x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs.
	// The first input initialises the accumulators directly.
	VMOVDQU64      (DX), Z20
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z17
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z19

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (BX), Z20
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD         Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 5 outputs
	VMOVDQU64 Z15, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z16, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_3x5_64_loop
	VZEROUPPER

mulGFNI_3x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites 5 outputs with the XOR of the VGF2P8AFFINEQB transforms of 3
// inputs, 32 bytes per iteration. Input i contributes to output j through
// matrix[5*i+j]. n>>5 blocks are processed, beginning at byte offset start in
// every input and output; a trailing n&31 bytes are not touched.
//
// Only 9 of the 15 matrix entries are kept in registers; matrix[9..14] are
// re-broadcast from memory on every iteration, so CX must keep pointing at
// the matrix for the whole function.
//
// Register use:
//   AX                    blocks left
//   CX                    matrix base
//   Y0-Y8                 matrix[0..8], each broadcast to every 64-bit lane
//   BX, SI, DX            read cursors for in[0], in[1], in[2]
//   R8, R9, R10, R11, DI  write cursors for out[0] .. out[4]
//   Y9-Y13                output accumulators, Y14 current input block,
//                         Y15 scratch / temporary matrix entry
TEXT ·mulAvxGFNI_3x5(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x5_end

	// Load the first 9 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), DI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DI

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DX

mulAvxGFNI_3x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs.
	// The first input initialises the accumulators directly.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs.
	// matrix[9] is fetched from memory via Y15.
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs.
	// matrix[10..14] are fetched from memory via Y15.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y13, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_3x5_loop
	VZEROUPPER

mulAvxGFNI_3x5_end:
	RET
|
|
|
|
// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs the VGF2P8AFFINEQB transforms of 3 inputs into the existing contents
// of 5 outputs, 64 bytes per iteration. Input i contributes to output j
// through matrix[5*i+j]. n>>6 blocks are processed, beginning at byte offset
// start in every input and output; a trailing n&63 bytes are not touched.
//
// Register use:
//   AX                   blocks left
//   Z0-Z14               matrix[0..14], each broadcast to every 64-bit lane
//   DX, BX, CX           read cursors for in[0], in[1], in[2]
//   DI, R8, R9, R10, SI  cursors for out[0] .. out[4]
//   Z15-Z19              output accumulators, Z20 current input block,
//                        Z21 scratch
TEXT ·mulGFNI_3x5_64Xor(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ   mulGFNI_3x5_64Xor_end

	// Load all 15 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), SI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, SI

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, CX

mulGFNI_3x5_64Xor_loop:
	// Load 5 outputs so the products accumulate onto them
	VMOVDQU64 (DI), Z15
	VMOVDQU64 (R8), Z16
	VMOVDQU64 (R9), Z17
	VMOVDQU64 (R10), Z18
	VMOVDQU64 (SI), Z19

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64      (DX), Z20
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD         Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (BX), Z20
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD         Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 5 outputs
	VMOVDQU64 Z15, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z16, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_3x5_64Xor_loop
	VZEROUPPER

mulGFNI_3x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs the VGF2P8AFFINEQB transforms of 3 inputs into the existing contents
// of 5 outputs, 32 bytes per iteration. Input i contributes to output j
// through matrix[5*i+j]. n>>5 blocks are processed, beginning at byte offset
// start in every input and output; a trailing n&31 bytes are not touched.
//
// Only 9 of the 15 matrix entries are kept in registers; matrix[9..14] are
// re-broadcast from memory on every iteration, so CX must keep pointing at
// the matrix for the whole function.
//
// Register use:
//   AX                    blocks left
//   CX                    matrix base
//   Y0-Y8                 matrix[0..8], each broadcast to every 64-bit lane
//   BX, SI, DX            read cursors for in[0], in[1], in[2]
//   R8, R9, R10, R11, DI  cursors for out[0] .. out[4]
//   Y9-Y13                output accumulators, Y14 current input block,
//                         Y15 scratch / temporary matrix entry
TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88
	// Block count. SHRQ sets ZF from its result, so no separate test is needed.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x5Xor_end

	// Load the first 9 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers come from consecutive slice headers, 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), DI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DI

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DX

mulAvxGFNI_3x5Xor_loop:
	// Load 5 outputs so the products accumulate onto them
	VMOVDQU (R8), Y9
	VMOVDQU (R9), Y10
	VMOVDQU (R10), Y11
	VMOVDQU (R11), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs.
	// matrix[9] is fetched from memory via Y15.
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs.
	// matrix[10..14] are fetched from memory via Y15.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y13, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_3x5Xor_loop
	VZEROUPPER

mulAvxGFNI_3x5Xor_end:
	RET
|
|
|
|
// mulGFNI_3x6_64 combines 3 inputs into 6 outputs, overwriting the outputs.
// For each 64-byte block: out[j] = T(m[0*6+j], in[0]) ^ T(m[1*6+j], in[1]) ^ T(m[2*6+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 64 bytes/loop for n>>6 iterations; returns immediately when n>>6 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..5] and matrix[0..17] are read without bounds checks.

// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x6_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = remaining 64-byte blocks, CX = matrix base (reused later as in[2] pointer)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_3x6_64_end

	// Z0-Z17 = matrix[0..17], each 64-bit entry broadcast to every lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Output data pointers: DI, R8, R9, R10, R11, SI
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), SI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, SI

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, CX

mulGFNI_3x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// The first input initialises the accumulators Z18-Z23 directly.
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z23

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 6 outputs
	VMOVDQU64 Z18, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z19, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x6_64_loop
	VZEROUPPER

mulGFNI_3x6_64_end:
	RET
|
|
|
|
// mulAvxGFNI_3x6 combines 3 inputs into 6 outputs, overwriting the outputs.
// For each 32-byte block: out[j] = T(m[0*6+j], in[0]) ^ T(m[1*6+j], in[1]) ^ T(m[2*6+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 32 bytes/loop for n>>5 iterations; returns immediately when n>>5 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..5] and matrix[0..17] are read without bounds checks.

// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x6(SB), $0-88
	// Loading 8 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = remaining 32-byte blocks, CX = matrix base (kept live for the whole loop)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_3x6_end

	// Y0-Y7 = matrix[0..7]; matrix[8..17] are re-broadcast from memory inside the loop
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Output data pointers: R8, R9, R10, R11, R12, DI
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), DI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DI

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DX

mulAvxGFNI_3x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// The first input initialises the accumulators Y8-Y13 directly.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	// Y15 doubles as table register and product scratch from here on.
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x6_loop
	VZEROUPPER

mulAvxGFNI_3x6_end:
	RET
|
|
|
|
// mulGFNI_3x6_64Xor combines 3 inputs into 6 outputs, XORing into the existing output contents.
// For each 64-byte block: out[j] ^= T(m[0*6+j], in[0]) ^ T(m[1*6+j], in[1]) ^ T(m[2*6+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 64 bytes/loop for n>>6 iterations; returns immediately when n>>6 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..5] and matrix[0..17] are read without bounds checks.

// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x6_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = remaining 64-byte blocks, CX = matrix base (reused later as in[2] pointer)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_3x6_64Xor_end

	// Z0-Z17 = matrix[0..17], each 64-bit entry broadcast to every lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Output data pointers: DI, R8, R9, R10, R11, SI
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), SI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, SI

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, CX

mulGFNI_3x6_64Xor_loop:
	// Load 6 outputs
	// The current output bytes seed the accumulators Z18-Z23.
	VMOVDQU64 (DI), Z18
	VMOVDQU64 (R8), Z19
	VMOVDQU64 (R9), Z20
	VMOVDQU64 (R10), Z21
	VMOVDQU64 (R11), Z22
	VMOVDQU64 (SI), Z23

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 6 outputs
	VMOVDQU64 Z18, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z19, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x6_64Xor_loop
	VZEROUPPER

mulGFNI_3x6_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_3x6Xor combines 3 inputs into 6 outputs, XORing into the existing output contents.
// For each 32-byte block: out[j] ^= T(m[0*6+j], in[0]) ^ T(m[1*6+j], in[1]) ^ T(m[2*6+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 32 bytes/loop for n>>5 iterations; returns immediately when n>>5 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..5] and matrix[0..17] are read without bounds checks.

// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88
	// Loading 8 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = remaining 32-byte blocks, CX = matrix base (kept live for the whole loop)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_3x6Xor_end

	// Y0-Y7 = matrix[0..7]; matrix[8..17] are re-broadcast from memory inside the loop
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Output data pointers: R8, R9, R10, R11, R12, DI
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), DI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DI

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DX

mulAvxGFNI_3x6Xor_loop:
	// Load 6 outputs
	// The current output bytes seed the accumulators Y8-Y13.
	VMOVDQU (R8), Y8
	VMOVDQU (R9), Y9
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x6Xor_loop
	VZEROUPPER

mulAvxGFNI_3x6Xor_end:
	RET
|
|
|
|
// mulGFNI_3x7_64 combines 3 inputs into 7 outputs, overwriting the outputs.
// For each 64-byte block: out[j] = T(m[0*7+j], in[0]) ^ T(m[1*7+j], in[1]) ^ T(m[2*7+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 64 bytes/loop for n>>6 iterations; returns immediately when n>>6 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..6] and matrix[0..20] are read without bounds checks.

// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x7_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 YMM used
	// AX = remaining 64-byte blocks, CX = matrix base (reused later as in[2] pointer)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_3x7_64_end

	// Z0-Z20 = matrix[0..20], each 64-bit entry broadcast to every lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Output data pointers: DI, R8, R9, R10, R11, R12, SI
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), SI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, SI

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, CX

mulGFNI_3x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// The first input initialises the accumulators Z21-Z27 directly.
	VMOVDQU64 (DX), Z28
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z28, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z28, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z28, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z28, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z28, Z27

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (BX), Z28
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
	VXORPD Z21, Z29, Z21
	VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
	VXORPD Z22, Z29, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
	VXORPD Z23, Z29, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (CX), Z28
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
	VXORPD Z21, Z29, Z21
	VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
	VXORPD Z22, Z29, Z22
	VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
	VXORPD Z23, Z29, Z23
	VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Store 7 outputs
	VMOVDQU64 Z21, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z22, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x7_64_loop
	VZEROUPPER

mulGFNI_3x7_64_end:
	RET
|
|
|
|
// mulAvxGFNI_3x7 combines 3 inputs into 7 outputs, overwriting the outputs.
// For each 32-byte block: out[j] = T(m[0*7+j], in[0]) ^ T(m[1*7+j], in[1]) ^ T(m[2*7+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 32 bytes/loop for n>>5 iterations; returns immediately when n>>5 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..6] and matrix[0..20] are read without bounds checks.

// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x7(SB), $0-88
	// Loading 7 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 YMM used
	// AX = remaining 32-byte blocks, CX = matrix base (kept live for the whole loop)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_3x7_end

	// Y0-Y6 = matrix[0..6]; matrix[7..20] are re-broadcast from memory inside the loop
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Output data pointers: R8, R9, R10, R11, R12, R13, DI
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), DI
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, DI

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DX

mulAvxGFNI_3x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// The first input initialises the accumulators Y7-Y13 directly.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// Y15 doubles as table register and product scratch from here on.
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x7_loop
	VZEROUPPER

mulAvxGFNI_3x7_end:
	RET
|
|
|
|
// mulGFNI_3x7_64Xor combines 3 inputs into 7 outputs, XORing into the existing output contents.
// For each 64-byte block: out[j] ^= T(m[0*7+j], in[0]) ^ T(m[1*7+j], in[1]) ^ T(m[2*7+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 64 bytes/loop for n>>6 iterations; returns immediately when n>>6 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..6] and matrix[0..20] are read without bounds checks.

// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x7_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 YMM used
	// AX = remaining 64-byte blocks, CX = matrix base (reused later as in[2] pointer)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_3x7_64Xor_end

	// Z0-Z20 = matrix[0..20], each 64-bit entry broadcast to every lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Output data pointers: DI, R8, R9, R10, R11, R12, SI
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), SI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, SI

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, CX

mulGFNI_3x7_64Xor_loop:
	// Load 7 outputs
	// The current output bytes seed the accumulators Z21-Z27.
	VMOVDQU64 (DI), Z21
	VMOVDQU64 (R8), Z22
	VMOVDQU64 (R9), Z23
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (SI), Z27

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (DX), Z28
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
	VXORPD Z21, Z29, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
	VXORPD Z22, Z29, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
	VXORPD Z23, Z29, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (BX), Z28
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
	VXORPD Z21, Z29, Z21
	VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
	VXORPD Z22, Z29, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
	VXORPD Z23, Z29, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (CX), Z28
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
	VXORPD Z21, Z29, Z21
	VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
	VXORPD Z22, Z29, Z22
	VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
	VXORPD Z23, Z29, Z23
	VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Store 7 outputs
	VMOVDQU64 Z21, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z22, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x7_64Xor_loop
	VZEROUPPER

mulGFNI_3x7_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_3x7Xor combines 3 inputs into 7 outputs, XORing into the existing output contents.
// For each 32-byte block: out[j] ^= T(m[0*7+j], in[0]) ^ T(m[1*7+j], in[1]) ^ T(m[2*7+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 32 bytes/loop for n>>5 iterations; returns immediately when n>>5 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..6] and matrix[0..20] are read without bounds checks.

// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88
	// Loading 7 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 YMM used
	// AX = remaining 32-byte blocks, CX = matrix base (kept live for the whole loop)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_3x7Xor_end

	// Y0-Y6 = matrix[0..6]; matrix[7..20] are re-broadcast from memory inside the loop
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Output data pointers: R8, R9, R10, R11, R12, R13, DI
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), DI
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, DI

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DX

mulAvxGFNI_3x7Xor_loop:
	// Load 7 outputs
	// The current output bytes seed the accumulators Y7-Y13.
	VMOVDQU (R8), Y7
	VMOVDQU (R9), Y8
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x7Xor_loop
	VZEROUPPER

mulAvxGFNI_3x7Xor_end:
	RET
|
|
|
|
// mulGFNI_3x8_64 combines 3 inputs into 8 outputs, overwriting the outputs.
// For each 64-byte block: out[j] = T(m[0*8+j], in[0]) ^ T(m[1*8+j], in[1]) ^ T(m[2*8+j], in[2]),
// where T is VGF2P8AFFINEQB with a zero constant and m[k] is the 64-bit word matrix[k].
// Processes 64 bytes/loop for n>>6 iterations; returns immediately when n>>6 is zero.
// Reads and writes begin at byte offset start of every in/out slice.
// The headers of in[0..2] and out[0..7] and matrix[0..23] are read without bounds checks.

// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x8_64(SB), $0-88
	// Loading 22 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	// AX = remaining 64-byte blocks, CX = matrix base (kept live for the whole loop)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_3x8_64_end

	// Z0-Z21 = matrix[0..21]; matrix[22] and matrix[23] are used as
	// broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Output data pointers: R8, R9, R10, R11, R12, R13, R14, DI
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), DI
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, DI

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DX

mulGFNI_3x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// The first input initialises the accumulators Z22-Z29 directly.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	// The last two tables are not held in registers; broadcast them from memory.
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x8_64_loop
	VZEROUPPER

mulGFNI_3x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x8 combines 3 input slices into 8 output slices, 32 bytes per
// loop iteration, OVERWRITING the outputs. For every processed byte position:
//
//   out[j] = T(matrix[0*8+j], in[0]) ^ T(matrix[1*8+j], in[1]) ^ T(matrix[2*8+j], in[2])   (j = 0..7)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>5 whole
// 32-byte blocks; any remainder of n is ignored. If n < 32 the function returns
// before touching memory or vector registers. in[0..2] and out[0..7] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// Register use:
//   AX            remaining 32-byte blocks
//   CX            matrix base
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R14, DI    write cursors for out[0]..out[7]
//   R15           start offset
//   Y0-Y5         matrix[0..5], broadcast once before the loop
//   Y6-Y13        accumulators for out[0]..out[7]
//   Y14           current input block
//   Y15           scratch (matrix entry, then product)
TEXT ·mulAvxGFNI_3x8(SB), $0-88
	// Loading 6 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..7]; DI is overwritten last with out[7].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), DI
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, DI

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DX

mulAvxGFNI_3x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (first input initialises the accumulators, so no XOR here)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x8_loop
	VZEROUPPER

mulAvxGFNI_3x8_end:
	RET
|
|
|
|
// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x8_64Xor combines 3 input slices into 8 output slices, 64 bytes per
// loop iteration, XOR-ing the result INTO the existing output contents. For
// every processed byte position:
//
//   out[j] ^= T(matrix[0*8+j], in[0]) ^ T(matrix[1*8+j], in[1]) ^ T(matrix[2*8+j], in[2])   (j = 0..7)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>6 whole
// 64-byte blocks; any remainder of n is ignored. If n < 64 the function returns
// before touching memory or vector registers. in[0..2] and out[0..7] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// Register use:
//   AX            remaining 64-byte blocks
//   CX            matrix base (matrix[22], matrix[23] are read from memory each iteration)
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R14, DI    read/write cursors for out[0]..out[7]
//   R15           start offset
//   Z0-Z21        matrix[0..21], broadcast once before the loop
//   Z22-Z29       accumulators for out[0]..out[7]
//   Z30           current input block
//   Z31           scratch product
TEXT ·mulGFNI_3x8_64Xor(SB), $0-88
	// Loading 22 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_3x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..7]; DI is overwritten last with out[7].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), DI
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, DI

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DX

mulGFNI_3x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (R8), Z22
	VMOVDQU64 (R9), Z23
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (DI), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (the last two matrix entries do not fit in registers and are
	// broadcast straight from memory)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x8_64Xor_loop
	VZEROUPPER

mulGFNI_3x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x8Xor combines 3 input slices into 8 output slices, 32 bytes per
// loop iteration, XOR-ing the result INTO the existing output contents. For
// every processed byte position:
//
//   out[j] ^= T(matrix[0*8+j], in[0]) ^ T(matrix[1*8+j], in[1]) ^ T(matrix[2*8+j], in[2])   (j = 0..7)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>5 whole
// 32-byte blocks; any remainder of n is ignored. If n < 32 the function returns
// before touching memory or vector registers. in[0..2] and out[0..7] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// Register use:
//   AX            remaining 32-byte blocks
//   CX            matrix base
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R14, DI    read/write cursors for out[0]..out[7]
//   R15           start offset
//   Y0-Y5         matrix[0..5], broadcast once before the loop
//   Y6-Y13        accumulators for out[0]..out[7]
//   Y14           current input block
//   Y15           scratch (matrix entry, then product)
TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88
	// Loading 6 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..7]; DI is overwritten last with out[7].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), DI
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, DI

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DX

mulAvxGFNI_3x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (R8), Y6
	VMOVDQU (R9), Y7
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x8Xor_loop
	VZEROUPPER

mulAvxGFNI_3x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x9_64 combines 3 input slices into 9 output slices, 64 bytes per
// loop iteration, OVERWRITING the outputs. For every processed byte position:
//
//   out[j] = T(matrix[0*9+j], in[0]) ^ T(matrix[1*9+j], in[1]) ^ T(matrix[2*9+j], in[2])   (j = 0..8)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>6 whole
// 64-byte blocks; any remainder of n is ignored. If n < 64 the function returns
// before touching memory or vector registers. in[0..2] and out[0..8] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// NOTE(review): BP is used as a scratch register here; the non-zero ($8) frame
// is presumably what makes the Go assembler save and restore it - confirm
// before changing the frame size.
//
// Register use:
//   AX            remaining 64-byte blocks
//   CX            matrix base (matrix[21..26] are read from memory each iteration)
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R15, DI    write cursors for out[0]..out[8]
//   BP            start offset
//   Z0-Z20        matrix[0..20], broadcast once before the loop
//   Z21-Z29       accumulators for out[0]..out[8]
//   Z30           current input block
//   Z31           scratch product
TEXT ·mulGFNI_3x9_64(SB), $8-88
	// Loading 21 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_3x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..8]; DI is overwritten last with out[8].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DX

mulGFNI_3x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (first input initialises the accumulators, so no XOR here)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (the last six matrix entries do not fit in registers and are
	// broadcast straight from memory)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	VMOVDQU64 Z21, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x9_64_loop
	VZEROUPPER

mulGFNI_3x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x9 combines 3 input slices into 9 output slices, 32 bytes per
// loop iteration, OVERWRITING the outputs. For every processed byte position:
//
//   out[j] = T(matrix[0*9+j], in[0]) ^ T(matrix[1*9+j], in[1]) ^ T(matrix[2*9+j], in[2])   (j = 0..8)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>5 whole
// 32-byte blocks; any remainder of n is ignored. If n < 32 the function returns
// before touching memory or vector registers. in[0..2] and out[0..8] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// NOTE(review): BP is used as a scratch register here; the non-zero ($8) frame
// is presumably what makes the Go assembler save and restore it - confirm
// before changing the frame size.
//
// Register use:
//   AX            remaining 32-byte blocks
//   CX            matrix base
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R15, DI    write cursors for out[0]..out[8]
//   BP            start offset
//   Y0-Y4         matrix[0..4], broadcast once before the loop
//   Y5-Y13        accumulators for out[0]..out[8]
//   Y14           current input block
//   Y15           scratch (matrix entry, then product)
TEXT ·mulAvxGFNI_3x9(SB), $8-88
	// Loading 5 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..8]; DI is overwritten last with out[8].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DX

mulAvxGFNI_3x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (first input initialises the accumulators, so no XOR here)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x9_loop
	VZEROUPPER

mulAvxGFNI_3x9_end:
	RET
|
|
|
|
// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x9_64Xor combines 3 input slices into 9 output slices, 64 bytes per
// loop iteration, XOR-ing the result INTO the existing output contents. For
// every processed byte position:
//
//   out[j] ^= T(matrix[0*9+j], in[0]) ^ T(matrix[1*9+j], in[1]) ^ T(matrix[2*9+j], in[2])   (j = 0..8)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>6 whole
// 64-byte blocks; any remainder of n is ignored. If n < 64 the function returns
// before touching memory or vector registers. in[0..2] and out[0..8] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// NOTE(review): BP is used as a scratch register here; the non-zero ($8) frame
// is presumably what makes the Go assembler save and restore it - confirm
// before changing the frame size.
//
// Register use:
//   AX            remaining 64-byte blocks
//   CX            matrix base (matrix[21..26] are read from memory each iteration)
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R15, DI    read/write cursors for out[0]..out[8]
//   BP            start offset
//   Z0-Z20        matrix[0..20], broadcast once before the loop
//   Z21-Z29       accumulators for out[0]..out[8]
//   Z30           current input block
//   Z31           scratch product
TEXT ·mulGFNI_3x9_64Xor(SB), $8-88
	// Loading 21 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_3x9_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..8]; DI is overwritten last with out[8].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DX

mulGFNI_3x9_64Xor_loop:
	// Load 9 outputs
	VMOVDQU64 (R8), Z21
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (DI), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (the last six matrix entries do not fit in registers and are
	// broadcast straight from memory)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	VMOVDQU64 Z21, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x9_64Xor_loop
	VZEROUPPER

mulGFNI_3x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x9Xor combines 3 input slices into 9 output slices, 32 bytes per
// loop iteration, XOR-ing the result INTO the existing output contents. For
// every processed byte position:
//
//   out[j] ^= T(matrix[0*9+j], in[0]) ^ T(matrix[1*9+j], in[1]) ^ T(matrix[2*9+j], in[2])   (j = 0..8)
//
// where T(m, x) is VGF2P8AFFINEQB with a zero constant, i.e. every byte of x is
// transformed by the 8x8 bit matrix held in the 64-bit value m.
// NOTE(review): the matrix entries are presumably GF(2^8) multiplication
// tables in bit-matrix form - confirm against the Go caller.
//
// Processing starts at byte offset start in every slice and covers n>>5 whole
// 32-byte blocks; any remainder of n is ignored. If n < 32 the function returns
// before touching memory or vector registers. in[0..2] and out[0..8] are
// indexed unconditionally, so the caller must supply at least that many slices.
//
// NOTE(review): BP is used as a scratch register here; the non-zero ($8) frame
// is presumably what makes the Go assembler save and restore it - confirm
// before changing the frame size.
//
// Register use:
//   AX            remaining 32-byte blocks
//   CX            matrix base
//   BX, SI, DX    read cursors for in[0], in[1], in[2]
//   R8-R15, DI    read/write cursors for out[0]..out[8]
//   BP            start offset
//   Y0-Y4         matrix[0..4], broadcast once before the loop
//   Y5-Y13        accumulators for out[0]..out[8]
//   Y14           current input block
//   Y15           scratch (matrix entry, then product)
TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88
	// Loading 5 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_3x9Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..8]; DI is overwritten last with out[8].
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DX

mulAvxGFNI_3x9Xor_loop:
	// Load 9 outputs
	VMOVDQU (R8), Y5
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x9Xor_loop
	VZEROUPPER

mulAvxGFNI_3x9Xor_end:
	RET
|
|
|
|
// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of each
// of the 3 input and 10 output slices. For every block and every output j
// (0..9) it stores
//   T(matrix[j], in[0]) ^ T(matrix[10+j], in[1]) ^ T(matrix[20+j], in[2])
// into out[j], where T is VGF2P8AFFINEQB with a zero immediate and the matrix
// entry as the 8x8 bit-matrix operand. Previous output contents are
// overwritten; the trailing n&63 bytes are not touched.
//
// Register roles:
//   CX               matrix base (entries 20..29 are read from memory in the loop)
//   DX, BX, AX       cursors into in[0], in[1], in[2]
//   DI, R8-R15, SI   cursors into out[0]..out[9]
//   BP               remaining block count
//   Z0-Z19           matrix[0..19], broadcast to every 64-bit lane
//   Z20-Z29          per-output accumulators
//   Z30, Z31         input block / scratch
// NOTE(review): the 8-byte frame is presumably what makes the assembler
// preserve BP around this function - confirm before changing the frame size.
TEXT ·mulGFNI_3x10_64(SB), $8-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = n / 64; the shift itself sets ZF
	JZ   mulGFNI_3x10_64_end

	// Keep the first 20 of the 30 matrix entries resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Data pointers of the three inputs (slice headers are 24 bytes apart).
	// AX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), AX

	// Data pointers of the ten outputs; SI doubles as table base until the end.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), R15
	MOVQ 216(SI), SI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, SI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, AX

	// AX was recycled as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_3x10_64_loop:
	// Input 0 initialises the ten accumulators directly.
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Input 1: transform with matrix[10..19] and fold into the accumulators.
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Input 2: matrix[20..29] did not fit in registers, so each entry is
	// broadcast straight from memory by the .BCST form.
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD              Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 10 outputs and advance every output cursor by one block.
	VMOVDQU64 Z20, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z21, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z22, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_3x10_64_loop
	VZEROUPPER

mulGFNI_3x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit variant: processes n>>5 blocks of 32 bytes, beginning at byte offset
// start of each of the 3 input and 10 output slices. For every block and
// every output j (0..9) it stores
//   T(matrix[j], in[0]) ^ T(matrix[10+j], in[1]) ^ T(matrix[20+j], in[2])
// into out[j], where T is VGF2P8AFFINEQB with a zero immediate and the matrix
// entry as the 8x8 bit-matrix operand. Previous output contents are
// overwritten; the trailing n&31 bytes are not touched.
//
// Register roles:
//   CX               matrix base (entries 4..29 are re-broadcast every iteration)
//   DX, BX, AX       cursors into in[0], in[1], in[2]
//   DI, R8-R15, SI   cursors into out[0]..out[9]
//   BP               remaining block count
//   Y0-Y3            matrix[0..3], broadcast to every 64-bit lane
//   Y4-Y13           per-output accumulators
//   Y14, Y15         input block / scratch
// NOTE(review): the 8-byte frame is presumably what makes the assembler
// preserve BP around this function - confirm before changing the frame size.
TEXT ·mulAvxGFNI_3x10(SB), $8-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = n / 32; the shift itself sets ZF
	JZ   mulAvxGFNI_3x10_end

	// Only 4 of the 30 matrix entries fit alongside the accumulators.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Data pointers of the three inputs (slice headers are 24 bytes apart).
	// AX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), AX

	// Data pointers of the ten outputs; SI doubles as table base until the end.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), R15
	MOVQ 216(SI), SI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, SI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, AX

	// AX was recycled as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_3x10_loop:
	// Input 0 initialises the ten accumulators. Outputs 4..9 first load their
	// matrix entry into the accumulator register and transform in place.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD   32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD   40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD   48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Input 1: matrix[10..19], each broadcast into Y15, transformed and folded in.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 2: matrix[20..29], same pattern.
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 10 outputs and advance every output cursor by one block.
	VMOVDQU Y4, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y5, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_3x10_loop
	VZEROUPPER

mulAvxGFNI_3x10_end:
	RET
|
|
|
|
// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: processes n>>6 blocks of 64 bytes, beginning at byte
// offset start of each of the 3 input and 10 output slices. For every block
// and every output j (0..9) it XORs
//   T(matrix[j], in[0]) ^ T(matrix[10+j], in[1]) ^ T(matrix[20+j], in[2])
// into the bytes already present in out[j], where T is VGF2P8AFFINEQB with a
// zero immediate and the matrix entry as the 8x8 bit-matrix operand. The
// trailing n&63 bytes are not touched.
//
// Register roles:
//   CX               matrix base (entries 20..29 are read from memory in the loop)
//   DX, BX, AX       cursors into in[0], in[1], in[2]
//   DI, R8-R15, SI   cursors into out[0]..out[9]
//   BP               remaining block count
//   Z0-Z19           matrix[0..19], broadcast to every 64-bit lane
//   Z20-Z29          per-output accumulators
//   Z30, Z31         input block / scratch
// NOTE(review): the 8-byte frame is presumably what makes the assembler
// preserve BP around this function - confirm before changing the frame size.
TEXT ·mulGFNI_3x10_64Xor(SB), $8-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = n / 64; the shift itself sets ZF
	JZ   mulGFNI_3x10_64Xor_end

	// Keep the first 20 of the 30 matrix entries resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Data pointers of the three inputs (slice headers are 24 bytes apart).
	// AX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), AX

	// Data pointers of the ten outputs; SI doubles as table base until the end.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), R15
	MOVQ 216(SI), SI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, SI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, AX

	// AX was recycled as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_3x10_64Xor_loop:
	// Seed the accumulators with the current output contents.
	VMOVDQU64 (DI), Z20
	VMOVDQU64 (R8), Z21
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (SI), Z29

	// Input 0: transform with matrix[0..9] and fold into the accumulators.
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Input 1: matrix[10..19].
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Input 2: matrix[20..29] did not fit in registers, so each entry is
	// broadcast straight from memory by the .BCST form.
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD              Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 10 outputs and advance every output cursor by one block.
	VMOVDQU64 Z20, (DI)
	ADDQ      $0x40, DI
	VMOVDQU64 Z21, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z22, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (SI)
	ADDQ      $0x40, SI

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_3x10_64Xor_loop
	VZEROUPPER

mulGFNI_3x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit accumulating variant: processes n>>5 blocks of 32 bytes, beginning
// at byte offset start of each of the 3 input and 10 output slices. For every
// block and every output j (0..9) it XORs
//   T(matrix[j], in[0]) ^ T(matrix[10+j], in[1]) ^ T(matrix[20+j], in[2])
// into the bytes already present in out[j], where T is VGF2P8AFFINEQB with a
// zero immediate and the matrix entry as the 8x8 bit-matrix operand. The
// trailing n&31 bytes are not touched.
//
// Register roles:
//   CX               matrix base (entries 4..29 are re-broadcast every iteration)
//   DX, BX, AX       cursors into in[0], in[1], in[2]
//   DI, R8-R15, SI   cursors into out[0]..out[9]
//   BP               remaining block count
//   Y0-Y3            matrix[0..3], broadcast to every 64-bit lane
//   Y4-Y13           per-output accumulators
//   Y14, Y15         input block / scratch
// NOTE(review): the 8-byte frame is presumably what makes the assembler
// preserve BP around this function - confirm before changing the frame size.
TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = n / 32; the shift itself sets ZF
	JZ   mulAvxGFNI_3x10Xor_end

	// Only 4 of the 30 matrix entries fit alongside the accumulators.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Data pointers of the three inputs (slice headers are 24 bytes apart).
	// AX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), AX

	// Data pointers of the ten outputs; SI doubles as table base until the end.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), R12
	MOVQ 144(SI), R13
	MOVQ 168(SI), R14
	MOVQ 192(SI), R15
	MOVQ 216(SI), SI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, SI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, AX

	// AX was recycled as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_3x10Xor_loop:
	// Seed the accumulators with the current output contents.
	VMOVDQU (DI), Y4
	VMOVDQU (R8), Y5
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (SI), Y13

	// Input 0: matrix[0..3] come from registers, matrix[4..9] are broadcast
	// into Y15 on the fly; every product is folded into its accumulator.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 1: matrix[10..19].
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 2: matrix[20..29].
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 10 outputs and advance every output cursor by one block.
	VMOVDQU Y4, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y5, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_3x10Xor_loop
	VZEROUPPER

mulAvxGFNI_3x10Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of each
// of the 4 input slices and of out[0]. For every block it stores
//   T(matrix[0], in[0]) ^ T(matrix[1], in[1]) ^ T(matrix[2], in[2]) ^ T(matrix[3], in[3])
// into out[0], where T is VGF2P8AFFINEQB with a zero immediate and the matrix
// entry as the 8x8 bit-matrix operand. Previous output contents are
// overwritten; the trailing n&63 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   DI               cursor into out[0]
//   R8               start offset
//   Z0-Z3            matrix[0..3], broadcast to every 64-bit lane
//   Z4               accumulator, Z5 input block / scratch
TEXT ·mulGFNI_4x1_64(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = n / 64; the shift itself sets ZF
	JZ   mulGFNI_4x1_64_end

	// All four matrix entries stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointer of the single output.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), DI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, CX

mulGFNI_4x1_64_loop:
	// Input 0 initialises the accumulator directly.
	VMOVDQU64      (DX), Z5
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z4

	// Inputs 1..3 are transformed in place in Z5 and folded into Z4.
	VMOVDQU64      (BX), Z5
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z5
	VXORPD         Z4, Z5, Z4

	VMOVDQU64      (SI), Z5
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
	VXORPD         Z4, Z5, Z4

	VMOVDQU64      (CX), Z5
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z3, Z5, Z5
	VXORPD         Z4, Z5, Z4

	// Store 1 outputs
	VMOVDQU64 Z4, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x1_64_loop
	VZEROUPPER

mulGFNI_4x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit variant: processes n>>5 blocks of 32 bytes, beginning at byte offset
// start of each of the 4 input slices and of out[0]. For every block it stores
//   T(matrix[0], in[0]) ^ T(matrix[1], in[1]) ^ T(matrix[2], in[2]) ^ T(matrix[3], in[3])
// into out[0], where T is VGF2P8AFFINEQB with a zero immediate and the matrix
// entry as the 8x8 bit-matrix operand. Previous output contents are
// overwritten; the trailing n&31 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   DI               cursor into out[0]
//   R8               start offset
//   Y0-Y3            matrix[0..3], broadcast to every 64-bit lane
//   Y4               accumulator, Y5 input block / scratch
TEXT ·mulAvxGFNI_4x1(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = n / 32; the shift itself sets ZF
	JZ   mulAvxGFNI_4x1_end

	// All four matrix entries stay resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointer of the single output.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), DI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, CX

mulAvxGFNI_4x1_loop:
	// Input 0 initialises the accumulator directly.
	VMOVDQU        (DX), Y5
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y5, Y4

	// Inputs 1..3 are transformed in place in Y5 and folded into Y4.
	VMOVDQU        (BX), Y5
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y5, Y5
	VXORPD         Y4, Y5, Y4

	VMOVDQU        (SI), Y5
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
	VXORPD         Y4, Y5, Y4

	VMOVDQU        (CX), Y5
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y3, Y5, Y5
	VXORPD         Y4, Y5, Y4

	// Store 1 outputs
	VMOVDQU Y4, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x1_loop
	VZEROUPPER

mulAvxGFNI_4x1_end:
	RET
|
|
|
|
// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: processes n>>6 blocks of 64 bytes, beginning at byte
// offset start of each of the 4 input slices and of out[0]. For every block
// it XORs
//   T(matrix[0], in[0]) ^ T(matrix[1], in[1]) ^ T(matrix[2], in[2]) ^ T(matrix[3], in[3])
// into the bytes already present in out[0], where T is VGF2P8AFFINEQB with a
// zero immediate and the matrix entry as the 8x8 bit-matrix operand. The
// trailing n&63 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   DI               cursor into out[0]
//   R8               start offset
//   Z0-Z3            matrix[0..3], broadcast to every 64-bit lane
//   Z4               accumulator, Z5 input block / scratch
TEXT ·mulGFNI_4x1_64Xor(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = n / 64; the shift itself sets ZF
	JZ   mulGFNI_4x1_64Xor_end

	// All four matrix entries stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointer of the single output.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), DI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, CX

mulGFNI_4x1_64Xor_loop:
	// Seed the accumulator with the current output contents.
	VMOVDQU64 (DI), Z4

	// Each input is transformed in place in Z5 and folded into Z4.
	VMOVDQU64      (DX), Z5
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z5
	VXORPD         Z4, Z5, Z4

	VMOVDQU64      (BX), Z5
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z5
	VXORPD         Z4, Z5, Z4

	VMOVDQU64      (SI), Z5
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z5
	VXORPD         Z4, Z5, Z4

	VMOVDQU64      (CX), Z5
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z3, Z5, Z5
	VXORPD         Z4, Z5, Z4

	// Store 1 outputs
	VMOVDQU64 Z4, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x1_64Xor_loop
	VZEROUPPER

mulGFNI_4x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit accumulating variant: processes n>>5 blocks of 32 bytes, beginning
// at byte offset start of each of the 4 input slices and of out[0]. For every
// block it XORs
//   T(matrix[0], in[0]) ^ T(matrix[1], in[1]) ^ T(matrix[2], in[2]) ^ T(matrix[3], in[3])
// into the bytes already present in out[0], where T is VGF2P8AFFINEQB with a
// zero immediate and the matrix entry as the 8x8 bit-matrix operand. The
// trailing n&31 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   DI               cursor into out[0]
//   R8               start offset
//   Y0-Y3            matrix[0..3], broadcast to every 64-bit lane
//   Y4               accumulator, Y5 input block / scratch
TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = n / 32; the shift itself sets ZF
	JZ   mulAvxGFNI_4x1Xor_end

	// All four matrix entries stay resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointer of the single output.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), DI
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, DI

	// Add start offset to input
	ADDQ R8, DX
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, CX

mulAvxGFNI_4x1Xor_loop:
	// Seed the accumulator with the current output contents.
	VMOVDQU (DI), Y4

	// Each input is transformed in place in Y5 and folded into Y4.
	VMOVDQU        (DX), Y5
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y5, Y5
	VXORPD         Y4, Y5, Y4

	VMOVDQU        (BX), Y5
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y5, Y5
	VXORPD         Y4, Y5, Y4

	VMOVDQU        (SI), Y5
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y5, Y5
	VXORPD         Y4, Y5, Y4

	VMOVDQU        (CX), Y5
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y3, Y5, Y5
	VXORPD         Y4, Y5, Y4

	// Store 1 outputs
	VMOVDQU Y4, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x1Xor_loop
	VZEROUPPER

mulAvxGFNI_4x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of each
// of the 4 input and 2 output slices. For every block and each output j
// (0..1) it stores
//   T(matrix[j], in[0]) ^ T(matrix[2+j], in[1]) ^ T(matrix[4+j], in[2]) ^ T(matrix[6+j], in[3])
// into out[j], where T is VGF2P8AFFINEQB with a zero immediate and the matrix
// entry as the 8x8 bit-matrix operand. Previous output contents are
// overwritten; the trailing n&63 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   R8, DI           cursors into out[0], out[1]
//   R9               start offset
//   Z0-Z7            matrix[0..7], broadcast to every 64-bit lane
//   Z8, Z9           accumulators, Z10/Z11 input block / scratch
TEXT ·mulGFNI_4x2_64(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = n / 64; the shift itself sets ZF
	JZ   mulGFNI_4x2_64_end

	// All eight matrix entries stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointers of the two outputs; DI doubles as table base until the end.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), DI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8
	ADDQ R9, DI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, CX

mulGFNI_4x2_64_loop:
	// Input 0 initialises both accumulators directly.
	VMOVDQU64      (DX), Z10
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z8
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z9

	// Input 1: matrix[2..3].
	VMOVDQU64      (BX), Z10
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Input 2: matrix[4..5].
	VMOVDQU64      (SI), Z10
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Input 3: matrix[6..7].
	VMOVDQU64      (CX), Z10
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Store 2 outputs
	VMOVDQU64 Z8, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z9, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x2_64_loop
	VZEROUPPER

mulGFNI_4x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit variant: processes n>>5 blocks of 32 bytes, beginning at byte offset
// start of each of the 4 input and 2 output slices. For every block and each
// output j (0..1) it stores
//   T(matrix[j], in[0]) ^ T(matrix[2+j], in[1]) ^ T(matrix[4+j], in[2]) ^ T(matrix[6+j], in[3])
// into out[j], where T is VGF2P8AFFINEQB with a zero immediate and the matrix
// entry as the 8x8 bit-matrix operand. Previous output contents are
// overwritten; the trailing n&31 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   R8, DI           cursors into out[0], out[1]
//   R9               start offset
//   Y0-Y7            matrix[0..7], broadcast to every 64-bit lane
//   Y8, Y9           accumulators, Y10/Y11 input block / scratch
TEXT ·mulAvxGFNI_4x2(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = n / 32; the shift itself sets ZF
	JZ   mulAvxGFNI_4x2_end

	// All eight matrix entries stay resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointers of the two outputs; DI doubles as table base until the end.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), DI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8
	ADDQ R9, DI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, CX

mulAvxGFNI_4x2_loop:
	// Input 0 initialises both accumulators directly.
	VMOVDQU        (DX), Y10
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y9

	// Input 1: matrix[2..3].
	VMOVDQU        (BX), Y10
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Input 2: matrix[4..5].
	VMOVDQU        (SI), Y10
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y5, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Input 3: matrix[6..7].
	VMOVDQU        (CX), Y10
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Store 2 outputs
	VMOVDQU Y8, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y9, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x2_loop
	VZEROUPPER

mulAvxGFNI_4x2_end:
	RET
|
|
|
|
// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: processes n>>6 blocks of 64 bytes, beginning at byte
// offset start of each of the 4 input and 2 output slices. For every block
// and each output j (0..1) it XORs
//   T(matrix[j], in[0]) ^ T(matrix[2+j], in[1]) ^ T(matrix[4+j], in[2]) ^ T(matrix[6+j], in[3])
// into the bytes already present in out[j], where T is VGF2P8AFFINEQB with a
// zero immediate and the matrix entry as the 8x8 bit-matrix operand. The
// trailing n&63 bytes are not touched.
//
// Register roles:
//   AX               remaining block count
//   DX, BX, SI, CX   cursors into in[0]..in[3] (CX holds the matrix base first)
//   R8, DI           cursors into out[0], out[1]
//   R9               start offset
//   Z0-Z7            matrix[0..7], broadcast to every 64-bit lane
//   Z8, Z9           accumulators, Z10/Z11 input block / scratch
TEXT ·mulGFNI_4x2_64Xor(SB), $0-88
	// Leave before touching any vector state when there is no full block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = n / 64; the shift itself sets ZF
	JZ   mulGFNI_4x2_64Xor_end

	// All eight matrix entries stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// Data pointers of the four inputs (slice headers are 24 bytes apart).
	// CX is overwritten last because it holds the header table base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Data pointers of the two outputs; DI doubles as table base until the end.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), DI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8
	ADDQ R9, DI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, CX

mulGFNI_4x2_64Xor_loop:
	// Seed the accumulators with the current output contents.
	VMOVDQU64 (R8), Z8
	VMOVDQU64 (DI), Z9

	// Input 0: matrix[0..1].
	VMOVDQU64      (DX), Z10
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Input 1: matrix[2..3].
	VMOVDQU64      (BX), Z10
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Input 2: matrix[4..5].
	VMOVDQU64      (SI), Z10
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z5, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Input 3: matrix[6..7].
	VMOVDQU64      (CX), Z10
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z10, Z11
	VXORPD         Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z7, Z10, Z11
	VXORPD         Z9, Z11, Z9

	// Store 2 outputs
	VMOVDQU64 Z8, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z9, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x2_64Xor_loop
	VZEROUPPER

mulGFNI_4x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 blocks of 32 bytes beginning at byte offset start; an
// n%32 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[2*input+output], and the result is XORed into the bytes already
// present in out[0] and out[1].
// Nothing is bounds-checked here: the caller must supply at least 8 matrix
// entries, 4 input slices and 2 output slices, each covering start+n bytes.
TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88
	// Register plan (generator estimate: 12 YMM):
	//   Y0-Y7  broadcast matrix entries (all tables resident)
	//   Y8-Y9  running outputs, Y10 current input block, Y11 scratch
	//   DX, BX, SI, CX input cursors; R8, DI output cursors; AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x2Xor_end

	// Broadcast every 64-bit matrix entry across a full YMM register
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); CX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), DI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8
	ADDQ R9, DI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, CX

mulAvxGFNI_4x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R8), Y8
	VMOVDQU (DI), Y9

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (DX), Y10
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (BX), Y10
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (SI), Y10
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y5, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (CX), Y10
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y10, Y11
	VXORPD         Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y10, Y11
	VXORPD         Y9, Y11, Y9

	// Store 2 outputs
	VMOVDQU Y8, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y9, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x2Xor_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_4x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 blocks of 64 bytes beginning at byte offset start; an
// n%64 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[3*input+output]; the XOR of the 4 results overwrites the block in
// out[0], out[1] and out[2].
// Nothing is bounds-checked here: the caller must supply at least 12 matrix
// entries, 4 input slices and 3 output slices, each covering start+n bytes.
TEXT ·mulGFNI_4x3_64(SB), $0-88
	// Register plan (generator estimate: 17 vector registers):
	//   Z0-Z11   broadcast matrix entries (all tables resident)
	//   Z12-Z14  running outputs, Z15 current input block, Z16 scratch
	//   DX, BX, SI, CX input cursors; R8, R9, DI output cursors; AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x3_64_end

	// Broadcast every 64-bit matrix entry across a full ZMM register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); CX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), DI
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DI

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, CX

mulGFNI_4x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64      (DX), Z15
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z15, Z13
	VGF2P8AFFINEQB $0x00, Z2, Z15, Z14

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z15
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z15
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (CX), Z15
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Store 3 outputs
	VMOVDQU64 Z12, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z13, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z14, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x3_64_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_4x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 blocks of 32 bytes beginning at byte offset start; an
// n%32 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[3*input+output]; the XOR of the 4 results overwrites the block in
// out[0], out[1] and out[2].
// Nothing is bounds-checked here: the caller must supply at least 12 matrix
// entries, 4 input slices and 3 output slices, each covering start+n bytes.
TEXT ·mulAvxGFNI_4x3(SB), $0-88
	// Register plan (generator estimate: 17 YMM, one more than Y0-Y15):
	//   Y0-Y10   first 11 of the 12 broadcast matrix entries
	//   Y11-Y13  running outputs, Y14 current input block, Y15 scratch
	//   matrix[11] is re-broadcast from memory inside the loop, so CX must
	//   keep pointing at the matrix for the whole function.
	//   BX, SI, DI, DX input cursors; R9, R10, R8 output cursors; AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x3_end

	// Broadcast the resident 64-bit matrix entries across full YMM registers
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); DX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R8
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, R8

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, DX

mulAvxGFNI_4x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12

	// matrix[11] is not resident: broadcast it into the scratch register
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x3_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_4x3_end:
	RET
|
|
|
|
// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 blocks of 64 bytes beginning at byte offset start; an
// n%64 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[3*input+output], and the result is XORed into the bytes already
// present in out[0], out[1] and out[2].
// Nothing is bounds-checked here: the caller must supply at least 12 matrix
// entries, 4 input slices and 3 output slices, each covering start+n bytes.
TEXT ·mulGFNI_4x3_64Xor(SB), $0-88
	// Register plan (generator estimate: 17 vector registers):
	//   Z0-Z11   broadcast matrix entries (all tables resident)
	//   Z12-Z14  running outputs, Z15 current input block, Z16 scratch
	//   DX, BX, SI, CX input cursors; R8, R9, DI output cursors; AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x3_64Xor_end

	// Broadcast every 64-bit matrix entry across a full ZMM register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); CX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), DI
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DI

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, CX

mulGFNI_4x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R8), Z12
	VMOVDQU64 (R9), Z13
	VMOVDQU64 (DI), Z14

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z15
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z2, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z15
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z15
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (CX), Z15
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
	VXORPD         Z12, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z10, Z15, Z16
	VXORPD         Z13, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z11, Z15, Z16
	VXORPD         Z14, Z16, Z14

	// Store 3 outputs
	VMOVDQU64 Z12, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z13, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z14, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x3_64Xor_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_4x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 blocks of 32 bytes beginning at byte offset start; an
// n%32 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[3*input+output], and the result is XORed into the bytes already
// present in out[0], out[1] and out[2].
// Nothing is bounds-checked here: the caller must supply at least 12 matrix
// entries, 4 input slices and 3 output slices, each covering start+n bytes.
TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88
	// Register plan (generator estimate: 17 YMM, one more than Y0-Y15):
	//   Y0-Y10   first 11 of the 12 broadcast matrix entries
	//   Y11-Y13  running outputs, Y14 current input block, Y15 scratch
	//   matrix[11] is re-broadcast from memory inside the loop, so CX must
	//   keep pointing at the matrix for the whole function.
	//   BX, SI, DI, DX input cursors; R9, R10, R8 output cursors; AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x3Xor_end

	// Broadcast the resident 64-bit matrix entries across full YMM registers
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); DX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R8
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, R8

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, DX

mulAvxGFNI_4x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R9), Y11
	VMOVDQU (R10), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12

	// matrix[11] is not resident: broadcast it into the scratch register
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x3Xor_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_4x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 blocks of 64 bytes beginning at byte offset start; an
// n%64 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[4*input+output]; the XOR of the 4 results overwrites the block in
// out[0] through out[3].
// Nothing is bounds-checked here: the caller must supply at least 16 matrix
// entries, 4 input slices and 4 output slices, each covering start+n bytes.
TEXT ·mulGFNI_4x4_64(SB), $0-88
	// Register plan (generator estimate: 22 vector registers):
	//   Z0-Z15   broadcast matrix entries (all tables resident)
	//   Z16-Z19  running outputs, Z20 current input block, Z21 scratch
	//   DX, BX, SI, CX input cursors; R8, R9, R10, DI output cursors;
	//   AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x4_64_end

	// Broadcast every 64-bit matrix entry across a full ZMM register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); CX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), DI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DI

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, CX

mulGFNI_4x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64      (DX), Z20
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z16
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z17
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z19

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (BX), Z20
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (SI), Z20
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 4 outputs
	VMOVDQU64 Z16, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x4_64_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_4x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 blocks of 32 bytes beginning at byte offset start; an
// n%32 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[4*input+output]; the XOR of the 4 results overwrites the block in
// out[0] through out[3].
// Nothing is bounds-checked here: the caller must supply at least 16 matrix
// entries, 4 input slices and 4 output slices, each covering start+n bytes.
TEXT ·mulAvxGFNI_4x4(SB), $0-88
	// Register plan (generator estimate: 22 YMM, more than Y0-Y15):
	//   Y0-Y9    first 10 of the 16 broadcast matrix entries
	//   Y10-Y13  running outputs, Y14 current input block, Y15 scratch
	//   matrix[10..15] are re-broadcast from memory inside the loop, so CX
	//   must keep pointing at the matrix for the whole function.
	//   BX, SI, DI, DX input cursors; R9, R10, R11, R8 output cursors;
	//   AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x4_end

	// Broadcast the resident 64-bit matrix entries across full YMM registers
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); DX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R8
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R8

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, DX

mulAvxGFNI_4x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (entries from matrix[10] on are broadcast into the scratch register)
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y13, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x4_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_4x4_end:
	RET
|
|
|
|
// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 blocks of 64 bytes beginning at byte offset start; an
// n%64 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[4*input+output], and the result is XORed into the bytes already
// present in out[0] through out[3].
// Nothing is bounds-checked here: the caller must supply at least 16 matrix
// entries, 4 input slices and 4 output slices, each covering start+n bytes.
TEXT ·mulGFNI_4x4_64Xor(SB), $0-88
	// Register plan (generator estimate: 22 vector registers):
	//   Z0-Z15   broadcast matrix entries (all tables resident)
	//   Z16-Z19  running outputs, Z20 current input block, Z21 scratch
	//   DX, BX, SI, CX input cursors; R8, R9, R10, DI output cursors;
	//   AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x4_64Xor_end

	// Broadcast every 64-bit matrix entry across a full ZMM register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); CX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), DI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DI

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, CX

mulGFNI_4x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (R8), Z16
	VMOVDQU64 (R9), Z17
	VMOVDQU64 (R10), Z18
	VMOVDQU64 (DI), Z19

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64      (DX), Z20
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (BX), Z20
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (SI), Z20
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD         Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD         Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 4 outputs
	VMOVDQU64 Z16, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x4_64Xor_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_4x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 blocks of 32 bytes beginning at byte offset start; an
// n%32 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[4*input+output], and the result is XORed into the bytes already
// present in out[0] through out[3].
// Nothing is bounds-checked here: the caller must supply at least 16 matrix
// entries, 4 input slices and 4 output slices, each covering start+n bytes.
TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88
	// Register plan (generator estimate: 22 YMM, more than Y0-Y15):
	//   Y0-Y9    first 10 of the 16 broadcast matrix entries
	//   Y10-Y13  running outputs, Y14 current input block, Y15 scratch
	//   matrix[10..15] are re-broadcast from memory inside the loop, so CX
	//   must keep pointing at the matrix for the whole function.
	//   BX, SI, DI, DX input cursors; R9, R10, R11, R8 output cursors;
	//   AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x4Xor_end

	// Broadcast the resident 64-bit matrix entries across full YMM registers
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); DX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R8
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R8

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, DX

mulAvxGFNI_4x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R9), Y10
	VMOVDQU (R10), Y11
	VMOVDQU (R11), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (entries from matrix[10] on are broadcast into the scratch register)
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y13, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x4Xor_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_4x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 blocks of 64 bytes beginning at byte offset start; an
// n%64 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[5*input+output]; the XOR of the 4 results overwrites the block in
// out[0] through out[4].
// Nothing is bounds-checked here: the caller must supply at least 20 matrix
// entries, 4 input slices and 5 output slices, each covering start+n bytes.
TEXT ·mulGFNI_4x5_64(SB), $0-88
	// Register plan (generator estimate: 27 vector registers):
	//   Z0-Z19   broadcast matrix entries (all tables resident)
	//   Z20-Z24  running outputs, Z25 current input block, Z26 scratch
	//   DX, BX, SI, CX input cursors; R8, R9, R10, R11, DI output cursors;
	//   AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x5_64_end

	// Broadcast every 64-bit matrix entry across a full ZMM register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); CX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), DI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DI

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, CX

mulGFNI_4x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64      (DX), Z25
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z25, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z25, Z24

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (BX), Z25
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
	VXORPD         Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
	VXORPD         Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
	VXORPD         Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
	VXORPD         Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
	VXORPD         Z24, Z26, Z24

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (SI), Z25
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
	VXORPD         Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
	VXORPD         Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
	VXORPD         Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
	VXORPD         Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
	VXORPD         Z24, Z26, Z24

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64      (CX), Z25
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
	VXORPD         Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
	VXORPD         Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
	VXORPD         Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
	VXORPD         Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
	VXORPD         Z24, Z26, Z24

	// Store 5 outputs
	VMOVDQU64 Z20, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z21, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z22, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z23, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z24, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_4x5_64_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_4x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 blocks of 32 bytes beginning at byte offset start; an
// n%32 remainder is left untouched. For every block, each of the 4 inputs
// is passed through VGF2P8AFFINEQB (constant byte 0) with the 64-bit entry
// matrix[5*input+output]; the XOR of the 4 results overwrites the block in
// out[0] through out[4].
// Nothing is bounds-checked here: the caller must supply at least 20 matrix
// entries, 4 input slices and 5 output slices, each covering start+n bytes.
TEXT ·mulAvxGFNI_4x5(SB), $0-88
	// Register plan (generator estimate: 27 YMM, more than Y0-Y15):
	//   Y0-Y8    first 9 of the 20 broadcast matrix entries
	//   Y9-Y13   running outputs, Y14 current input block, Y15 scratch
	//   matrix[9..19] are re-broadcast from memory inside the loop, so CX
	//   must keep pointing at the matrix for the whole function.
	//   BX, SI, DI, DX input cursors; R9, R10, R11, R12, R8 output cursors;
	//   AX block count
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// empty case can be branched on without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x5_end

	// Broadcast the resident 64-bit matrix entries across full YMM registers
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart); DX is recycled last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R8
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R8

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, DX

mulAvxGFNI_4x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	// (entries from matrix[9] on are broadcast into the scratch register)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_4x5_loop

	// Only reached after vector registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_4x5_end:
	RET
|
|
|
|
// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of every
// input and output slice. For each block the existing contents of the 5
// outputs are loaded and XORed with the VGF2P8AFFINEQB transform (constant
// term 0) of each of the 4 inputs:
//   out[o] ^= affine(matrix[i*5+o], in[i])   i = 0..3, o = 0..4
// A tail of n&63 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 64-byte blocks
//   Z0-Z19   the 20 matrix entries, each 8-byte entry broadcast to a full ZMM
//   DX, BX, SI, CX        input 0..3 cursors
//   R8, R9, R10, R11, DI  output 0..4 cursors
//   Z20-Z24  output accumulators, Z25 current input block, Z26 scratch
TEXT ·mulGFNI_4x5_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 27 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x5_64Xor_end

	// Broadcast every matrix entry into its own register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Input data pointers; CX is reused for the last one because the
	// matrix is fully resident in registers from here on.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Output data pointers; DI ends up holding the last one.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), DI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DI

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, CX

mulGFNI_4x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (R8), Z20
	VMOVDQU64 (R9), Z21
	VMOVDQU64 (R10), Z22
	VMOVDQU64 (R11), Z23
	VMOVDQU64 (DI), Z24

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (DX), Z25
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z25, Z26
	VXORPD Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z25, Z26
	VXORPD Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z25, Z26
	VXORPD Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z25, Z26
	VXORPD Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z25, Z26
	VXORPD Z24, Z26, Z24

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z25
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
	VXORPD Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
	VXORPD Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
	VXORPD Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
	VXORPD Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
	VXORPD Z24, Z26, Z24

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (SI), Z25
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
	VXORPD Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
	VXORPD Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
	VXORPD Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
	VXORPD Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
	VXORPD Z24, Z26, Z24

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (CX), Z25
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
	VXORPD Z20, Z26, Z20
	VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
	VXORPD Z21, Z26, Z21
	VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
	VXORPD Z22, Z26, Z22
	VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
	VXORPD Z23, Z26, Z23
	VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
	VXORPD Z24, Z26, Z24

	// Store 5 outputs
	VMOVDQU64 Z20, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z21, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z22, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z23, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z24, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x5_64Xor_loop
	VZEROUPPER

mulGFNI_4x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start of every
// input and output slice. For each block the existing contents of the 5
// outputs are loaded and XORed with the VGF2P8AFFINEQB transform (constant
// term 0) of each of the 4 inputs:
//   out[o] ^= affine(matrix[i*5+o], in[i])   i = 0..3, o = 0..4
// A tail of n&31 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 32-byte blocks
//   CX       matrix base (kept: entries 9..19 are re-broadcast inside the loop)
//   Y0-Y8    matrix entries 0..8
//   BX, SI, DI, DX         input 0..3 cursors
//   R9, R10, R11, R12, R8  output 0..4 cursors
//   Y9-Y13   output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88
	// Loading 9 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 27 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x5Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Input data pointers; DX ends up holding the last one.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Output data pointers; R8 ends up holding the last one.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R8
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R8

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, DX

mulAvxGFNI_4x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (R9), Y9
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12

	// From here on the matrix entry is broadcast into the scratch
	// register right before use.
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x5Xor_loop
	VZEROUPPER

mulAvxGFNI_4x5Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of every
// input and output slice. Each of the 6 outputs is overwritten with the XOR
// of the VGF2P8AFFINEQB transforms (constant term 0) of the 4 inputs:
//   out[o] = XOR over i of affine(matrix[i*6+o], in[i])   i = 0..3, o = 0..5
// A tail of n&63 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 64-byte blocks
//   Z0-Z23   the 24 matrix entries, each 8-byte entry broadcast to a full ZMM
//   DX, BX, SI, CX             input 0..3 cursors
//   R8, R9, R10, R11, R12, DI  output 0..5 cursors
//   Z24-Z29  output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_4x6_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x6_64_end

	// Broadcast every matrix entry into its own register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Input data pointers; CX is reused for the last one because the
	// matrix is fully resident in registers from here on.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Output data pointers; DI ends up holding the last one.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), DI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DI

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, CX

mulGFNI_4x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// (input 0 initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z25, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z26, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z27, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z28, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z29, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x6_64_loop
	VZEROUPPER

mulGFNI_4x6_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start of every
// input and output slice. Each of the 6 outputs is overwritten with the XOR
// of the VGF2P8AFFINEQB transforms (constant term 0) of the 4 inputs:
//   out[o] = XOR over i of affine(matrix[i*6+o], in[i])   i = 0..3, o = 0..5
// A tail of n&31 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 32-byte blocks
//   CX       matrix base (kept: entries 8..23 are re-broadcast inside the loop)
//   Y0-Y7    matrix entries 0..7
//   BX, SI, DI, DX              input 0..3 cursors
//   R9, R10, R11, R12, R13, R8  output 0..5 cursors
//   Y8-Y13   output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_4x6(SB), $0-88
	// Loading 8 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x6_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers; DX ends up holding the last one.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Output data pointers; R8 ends up holding the last one.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R8
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R8

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, DX

mulAvxGFNI_4x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (input 0 initialises the accumulators, so no XOR is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9

	// From here on the matrix entry is broadcast into the scratch
	// register right before use.
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x6_loop
	VZEROUPPER

mulAvxGFNI_4x6_end:
	RET
|
|
|
|
// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of every
// input and output slice. For each block the existing contents of the 6
// outputs are loaded and XORed with the VGF2P8AFFINEQB transform (constant
// term 0) of each of the 4 inputs:
//   out[o] ^= affine(matrix[i*6+o], in[i])   i = 0..3, o = 0..5
// A tail of n&63 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 64-byte blocks
//   Z0-Z23   the 24 matrix entries, each 8-byte entry broadcast to a full ZMM
//   DX, BX, SI, CX             input 0..3 cursors
//   R8, R9, R10, R11, R12, DI  output 0..5 cursors
//   Z24-Z29  output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_4x6_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x6_64Xor_end

	// Broadcast every matrix entry into its own register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Input data pointers; CX is reused for the last one because the
	// matrix is fully resident in registers from here on.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), CX

	// Output data pointers; DI ends up holding the last one.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), DI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DI

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, CX

mulGFNI_4x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (R8), Z24
	VMOVDQU64 (R9), Z25
	VMOVDQU64 (R10), Z26
	VMOVDQU64 (R11), Z27
	VMOVDQU64 (R12), Z28
	VMOVDQU64 (DI), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z25, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z26, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z27, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z28, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z29, (DI)
	ADDQ $0x40, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x6_64Xor_loop
	VZEROUPPER

mulGFNI_4x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start of every
// input and output slice. For each block the existing contents of the 6
// outputs are loaded and XORed with the VGF2P8AFFINEQB transform (constant
// term 0) of each of the 4 inputs:
//   out[o] ^= affine(matrix[i*6+o], in[i])   i = 0..3, o = 0..5
// A tail of n&31 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 32-byte blocks
//   CX       matrix base (kept: entries 8..23 are re-broadcast inside the loop)
//   Y0-Y7    matrix entries 0..7
//   BX, SI, DI, DX              input 0..3 cursors
//   R9, R10, R11, R12, R13, R8  output 0..5 cursors
//   Y8-Y13   output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88
	// Loading 8 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x6Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers; DX ends up holding the last one.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Output data pointers; R8 ends up holding the last one.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R8
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R8

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, DX

mulAvxGFNI_4x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (R9), Y8
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9

	// From here on the matrix entry is broadcast into the scratch
	// register right before use.
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x6Xor_loop
	VZEROUPPER

mulAvxGFNI_4x6Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start of every
// input and output slice. Each of the 7 outputs is overwritten with the XOR
// of the VGF2P8AFFINEQB transforms (constant term 0) of the 4 inputs:
//   out[o] = XOR over i of affine(matrix[i*7+o], in[i])   i = 0..3, o = 0..6
// A tail of n&63 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 64-byte blocks
//   CX       matrix base (kept: entries 23..27 are used as embedded-broadcast
//            memory operands inside the loop)
//   Z0-Z22   matrix entries 0..22, each broadcast to a full ZMM
//   BX, SI, DI, DX                   input 0..3 cursors
//   R9, R10, R11, R12, R13, R14, R8  output 0..6 cursors
//   Z23-Z29  output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_4x7_64(SB), $0-88
	// Loading 23 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x7_64_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Input data pointers; DX ends up holding the last one.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Output data pointers; R8 ends up holding the last one.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulGFNI_4x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// (input 0 initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24

	// The last five matrix entries do not have a register of their own;
	// they are read from memory with embedded broadcast.
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x7_64_loop
	VZEROUPPER

mulGFNI_4x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start of every
// input and output slice. Each of the 7 outputs is overwritten with the XOR
// of the VGF2P8AFFINEQB transforms (constant term 0) of the 4 inputs:
//   out[o] = XOR over i of affine(matrix[i*7+o], in[i])   i = 0..3, o = 0..6
// A tail of n&31 bytes is not processed here. Only the data pointer of each
// slice header (24 bytes apart) in in/out is read; lengths are not checked.
//
// Registers:
//   AX       remaining 32-byte blocks
//   CX       matrix base (kept: entries 7..27 are re-broadcast inside the loop)
//   Y0-Y6    matrix entries 0..6
//   BX, SI, DI, DX                   input 0..3 cursors
//   R9, R10, R11, R12, R13, R14, R8  output 0..6 cursors
//   Y7-Y13   output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_4x7(SB), $0-88
	// Loading 7 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from the result, so the
	// zero-block case can branch on it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x7_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Input data pointers; DX ends up holding the last one.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Output data pointers; R8 ends up holding the last one.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulAvxGFNI_4x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (input 0 initialises the accumulators, so no XOR is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// (from here on the matrix entry is broadcast into the scratch
	// register right before use)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x7_loop
	VZEROUPPER

mulAvxGFNI_4x7_end:
	RET
|
|
|
|
// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks, beginning at byte offset start in
// every slice, the GF2P8AFFINEQB transform of each of the 4 inputs is XORed
// into each of the 7 outputs. The existing output bytes are loaded first and
// accumulated into. The 8-byte matrix entry for input i / output j is read
// from byte offset 8*(i*7+j) of matrix. Bytes past the last whole 64-byte
// block are not touched; when n>>6 is zero the routine returns immediately.
//
// Register roles:
//   AX             remaining 64-byte block count
//   CX             matrix base pointer
//   BX, SI, DI, DX cursors into in[0], in[1], in[2], in[3]
//   R9-R14, R8     cursors into out[0]..out[6]
//   R15            start offset (setup only)
//   Z0-Z22         matrix entries 0..22, broadcast to all lanes
//   Z23-Z29        accumulators for out[0]..out[6]
//   Z30, Z31       current input block / scratch product
// Matrix entries 23..27 do not fit in registers and are read from memory
// with an embedded broadcast inside the loop.
TEXT ·mulGFNI_4x7_64Xor(SB), $0-88
	// Loading 23 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count to a count of 64-byte blocks; nothing to do if zero.
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_4x7_64Xor_end

	// Broadcast the first 23 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice; R8 is overwritten last.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulGFNI_4x7_64Xor_loop:
	// Load 7 outputs
	VMOVDQU64 (R9), Z23
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (entries 23..27 come straight from memory via embedded broadcast)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x7_64Xor_loop
	VZEROUPPER

mulGFNI_4x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks, beginning at byte offset start in
// every slice, the GF2P8AFFINEQB transform of each of the 4 inputs is XORed
// into each of the 7 outputs. The existing output bytes are loaded first and
// accumulated into. The 8-byte matrix entry for input i / output j is read
// from byte offset 8*(i*7+j) of matrix. Bytes past the last whole 32-byte
// block are not touched; when n>>5 is zero the routine returns immediately.
//
// Register roles:
//   AX             remaining 32-byte block count
//   CX             matrix base pointer
//   BX, SI, DI, DX cursors into in[0], in[1], in[2], in[3]
//   R9-R14, R8     cursors into out[0]..out[6]
//   R15            start offset (setup only)
//   Y0-Y6          matrix entries 0..6, broadcast to all lanes
//   Y7-Y13         accumulators for out[0]..out[6]
//   Y14            current input block
//   Y15            matrix entry being applied, then the scratch product
// Matrix entries 7..27 are re-broadcast from memory on every iteration.
TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88
	// Loading 7 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count to a count of 32-byte blocks; nothing to do if zero.
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_4x7Xor_end

	// Broadcast the first 7 matrix entries (those applied to input 0).
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice; R8 is overwritten last.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulAvxGFNI_4x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (R9), Y7
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x7Xor_loop
	VZEROUPPER

mulAvxGFNI_4x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks, beginning at byte offset start in
// every slice, each of the 8 outputs is overwritten with the XOR of the
// GF2P8AFFINEQB transforms of the 4 inputs. Previous output contents are not
// read. The 8-byte matrix entry for input i / output j is read from byte
// offset 8*(i*8+j) of matrix. Bytes past the last whole 64-byte block are not
// touched; when n>>6 is zero the routine returns immediately.
//
// Register roles:
//   AX              remaining 64-byte block count
//   CX              matrix base pointer
//   BX, SI, DI, DX  cursors into in[0], in[1], in[2], in[3]
//   R9-R15, R8      cursors into out[0]..out[7]
//   BP              start offset (setup only)
//   Z0-Z21          matrix entries 0..21, broadcast to all lanes
//   Z22-Z29         accumulators for out[0]..out[7]
//   Z30, Z31        current input block / scratch product
// Matrix entries 22..31 are read from memory with an embedded broadcast
// inside the loop.
// NOTE(review): BP is used as a scratch register; the non-zero frame size
// presumably exists so the assembler-generated prologue preserves it - confirm.
TEXT ·mulGFNI_4x8_64(SB), $8-88
	// Loading 22 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count to a count of 64-byte blocks; nothing to do if zero.
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_4x8_64_end

	// Broadcast the first 22 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice; R8 is overwritten last.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulGFNI_4x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (entries 22 and 23 come straight from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	// (all eight entries come from memory via embedded broadcast)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x8_64_loop
	VZEROUPPER

mulGFNI_4x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks, beginning at byte offset start in
// every slice, each of the 8 outputs is overwritten with the XOR of the
// GF2P8AFFINEQB transforms of the 4 inputs. Previous output contents are not
// read. The 8-byte matrix entry for input i / output j is read from byte
// offset 8*(i*8+j) of matrix. Bytes past the last whole 32-byte block are not
// touched; when n>>5 is zero the routine returns immediately.
//
// Register roles:
//   AX              remaining 32-byte block count
//   CX              matrix base pointer
//   BX, SI, DI, DX  cursors into in[0], in[1], in[2], in[3]
//   R9-R15, R8      cursors into out[0]..out[7]
//   BP              start offset (setup only)
//   Y0-Y5           matrix entries 0..5, broadcast to all lanes
//   Y6-Y13          accumulators for out[0]..out[7]
//   Y14             current input block
//   Y15             matrix entry being applied, then the scratch product
// Matrix entries 6..31 are re-broadcast from memory on every iteration.
// NOTE(review): BP is used as a scratch register; the non-zero frame size
// presumably exists so the assembler-generated prologue preserves it - confirm.
TEXT ·mulAvxGFNI_4x8(SB), $8-88
	// Loading 6 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count to a count of 32-byte blocks; nothing to do if zero.
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_4x8_end

	// Broadcast the first 6 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice; R8 is overwritten last.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulAvxGFNI_4x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, so no XOR is needed;
	// entries 6 and 7 are broadcast directly into their accumulator register)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x8_loop
	VZEROUPPER

mulAvxGFNI_4x8_end:
	RET
|
|
|
|
// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks, beginning at byte offset start in
// every slice, the GF2P8AFFINEQB transform of each of the 4 inputs is XORed
// into each of the 8 outputs. The existing output bytes are loaded first and
// accumulated into. The 8-byte matrix entry for input i / output j is read
// from byte offset 8*(i*8+j) of matrix. Bytes past the last whole 64-byte
// block are not touched; when n>>6 is zero the routine returns immediately.
//
// Register roles:
//   AX              remaining 64-byte block count
//   CX              matrix base pointer
//   BX, SI, DI, DX  cursors into in[0], in[1], in[2], in[3]
//   R9-R15, R8      cursors into out[0]..out[7]
//   BP              start offset (setup only)
//   Z0-Z21          matrix entries 0..21, broadcast to all lanes
//   Z22-Z29         accumulators for out[0]..out[7]
//   Z30, Z31        current input block / scratch product
// Matrix entries 22..31 are read from memory with an embedded broadcast
// inside the loop.
// NOTE(review): BP is used as a scratch register; the non-zero frame size
// presumably exists so the assembler-generated prologue preserves it - confirm.
TEXT ·mulGFNI_4x8_64Xor(SB), $8-88
	// Loading 22 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count to a count of 64-byte blocks; nothing to do if zero.
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_4x8_64Xor_end

	// Broadcast the first 22 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice; R8 is overwritten last.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulGFNI_4x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (entries 22 and 23 come straight from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	// (all eight entries come from memory via embedded broadcast)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x8_64Xor_loop
	VZEROUPPER

mulGFNI_4x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks, beginning at byte offset start in
// every slice, the GF2P8AFFINEQB transform of each of the 4 inputs is XORed
// into each of the 8 outputs. The existing output bytes are loaded first and
// accumulated into. The 8-byte matrix entry for input i / output j is read
// from byte offset 8*(i*8+j) of matrix. Bytes past the last whole 32-byte
// block are not touched; when n>>5 is zero the routine returns immediately.
//
// Register roles:
//   AX              remaining 32-byte block count
//   CX              matrix base pointer
//   BX, SI, DI, DX  cursors into in[0], in[1], in[2], in[3]
//   R9-R15, R8      cursors into out[0]..out[7]
//   BP              start offset (setup only)
//   Y0-Y5           matrix entries 0..5, broadcast to all lanes
//   Y6-Y13          accumulators for out[0]..out[7]
//   Y14             current input block
//   Y15             matrix entry being applied, then the scratch product
// Matrix entries 6..31 are re-broadcast from memory on every iteration.
// NOTE(review): BP is used as a scratch register; the non-zero frame size
// presumably exists so the assembler-generated prologue preserves it - confirm.
TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88
	// Loading 6 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count to a count of 32-byte blocks; nothing to do if zero.
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_4x8Xor_end

	// Broadcast the first 6 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Fetch the data pointer of each output slice; R8 is overwritten last.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulAvxGFNI_4x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x8Xor_loop
	VZEROUPPER

mulAvxGFNI_4x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 4 input slices by a 4x9 coefficient matrix and overwrites
// 9 output slices, 64 bytes per iteration. Only n>>6 whole 64-byte blocks
// are processed; any remaining n&63 bytes are left untouched.
//
// matrix[i*9+j] (8 bytes each) is the VGF2P8AFFINEQB operand that maps
// input i to output j. Entries 0..20 are kept broadcast in Z0-Z20; the
// remaining 15 are applied straight from memory with the .BCST form.
//
// Register roles inside the loop:
//   CX               matrix base
//   DX, BX, SI, AX   read cursors for in[0..3]
//   R8-R15, DI       write cursors for out[0..8]
//   BP               remaining 64-byte blocks
//   Z21-Z29          accumulators for out[0..8]
//   Z30              current input block
//   Z31              scratch product
TEXT ·mulGFNI_4x9_64(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ leaves ZF set when the block count is zero, so the branch can
	// follow it directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x9_64_end

	// Preload the first 21 of 36 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	// AX is loaded last because it also holds the header base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), AX

	// Data pointers of out[0..8]; DI is loaded last for the same reason.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, AX

	// AX was reused as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_4x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs.
	// The first input initialises the accumulators, so no XOR is needed.
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs.
	// Entries from offset 168 onward are not preloaded and are broadcast
	// from memory.
	VMOVDQU64           (SI), Z30
	ADDQ                $0x40, SI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	VMOVDQU64 Z21, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z22, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_4x9_64_loop
	VZEROUPPER

mulGFNI_4x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x9 coefficient matrix and overwrites
// 9 output slices, 32 bytes per iteration. Only n>>5 whole 32-byte blocks
// are processed; any remaining n&31 bytes are left untouched.
//
// matrix[i*9+j] (8 bytes each) is the VGF2P8AFFINEQB operand that maps
// input i to output j. Entries 0..4 are kept broadcast in Y0-Y4; every
// other entry is re-broadcast from memory each time it is used.
//
// Register roles inside the loop:
//   CX               matrix base
//   DX, BX, SI, AX   read cursors for in[0..3]
//   R8-R15, DI       write cursors for out[0..8]
//   BP               remaining 32-byte blocks
//   Y5-Y13           accumulators for out[0..8]
//   Y14              current input block
//   Y15              scratch matrix entry / product
TEXT ·mulAvxGFNI_4x9(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ leaves ZF set when the block count is zero, so the branch can
	// follow it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x9_end

	// Preload the first 5 of 36 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	// AX is loaded last because it also holds the header base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), AX

	// Data pointers of out[0..8]; DI is loaded last for the same reason.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, AX

	// AX was reused as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_4x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs.
	// The first input initialises the accumulators, so no XOR is needed;
	// entries that are not preloaded are broadcast into the accumulator
	// register itself before being overwritten by the product.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_4x9_loop
	VZEROUPPER

mulAvxGFNI_4x9_end:
	RET
|
|
|
|
// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 4 input slices by a 4x9 coefficient matrix and XORs the
// products into the existing contents of 9 output slices, 64 bytes per
// iteration. Only n>>6 whole 64-byte blocks are processed; any remaining
// n&63 bytes are left untouched.
//
// matrix[i*9+j] (8 bytes each) is the VGF2P8AFFINEQB operand that maps
// input i to output j. Entries 0..20 are kept broadcast in Z0-Z20; the
// remaining 15 are applied straight from memory with the .BCST form.
//
// Register roles inside the loop:
//   CX               matrix base
//   DX, BX, SI, AX   read cursors for in[0..3]
//   R8-R15, DI       read/write cursors for out[0..8]
//   BP               remaining 64-byte blocks
//   Z21-Z29          accumulators for out[0..8]
//   Z30              current input block
//   Z31              scratch product
TEXT ·mulGFNI_4x9_64Xor(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ leaves ZF set when the block count is zero, so the branch can
	// follow it directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x9_64Xor_end

	// Preload the first 21 of 36 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	// AX is loaded last because it also holds the header base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), AX

	// Data pointers of out[0..8]; DI is loaded last for the same reason.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, AX

	// AX was reused as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_4x9_64Xor_loop:
	// Load 9 outputs: the accumulators start from the current output
	// contents, so every product below is XORed in.
	VMOVDQU64 (R8), Z21
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (DI), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs.
	// Entries from offset 168 onward are not preloaded and are broadcast
	// from memory.
	VMOVDQU64           (SI), Z30
	ADDQ                $0x40, SI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	VMOVDQU64 Z21, (R8)
	ADDQ      $0x40, R8
	VMOVDQU64 Z22, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (DI)
	ADDQ      $0x40, DI

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_4x9_64Xor_loop
	VZEROUPPER

mulGFNI_4x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x9 coefficient matrix and XORs the
// products into the existing contents of 9 output slices, 32 bytes per
// iteration. Only n>>5 whole 32-byte blocks are processed; any remaining
// n&31 bytes are left untouched.
//
// matrix[i*9+j] (8 bytes each) is the VGF2P8AFFINEQB operand that maps
// input i to output j. Entries 0..4 are kept broadcast in Y0-Y4; every
// other entry is re-broadcast from memory each time it is used.
//
// Register roles inside the loop:
//   CX               matrix base
//   DX, BX, SI, AX   read cursors for in[0..3]
//   R8-R15, DI       read/write cursors for out[0..8]
//   BP               remaining 32-byte blocks
//   Y5-Y13           accumulators for out[0..8]
//   Y14              current input block
//   Y15              scratch matrix entry / product
TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ leaves ZF set when the block count is zero, so the branch can
	// follow it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x9Xor_end

	// Preload the first 5 of 36 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	// AX is loaded last because it also holds the header base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), AX

	// Data pointers of out[0..8]; DI is loaded last for the same reason.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, AX

	// AX was reused as an input cursor, so recompute the block count in BP.
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_4x9Xor_loop:
	// Load 9 outputs: the accumulators start from the current output
	// contents, so every product below is XORed in.
	VMOVDQU (R8), Y5
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y6, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_4x9Xor_loop
	VZEROUPPER

mulAvxGFNI_4x9Xor_end:
	RET
|
|
|
|
// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 4 input slices by a 4x10 coefficient matrix and overwrites
// 10 output slices, 64 bytes per iteration. Only n>>6 whole 64-byte blocks
// are processed; any remaining n&63 bytes are left untouched.
//
// matrix[i*10+j] (8 bytes each) is the VGF2P8AFFINEQB operand that maps
// input i to output j. Entries 0..19 are kept broadcast in Z0-Z19; the
// remaining 20 are applied straight from memory with the .BCST form.
//
// Output data pointers are not cached in registers: each one is re-read
// from the out slice headers on every iteration and indexed by the running
// byte offset in R9.
//
// Register roles inside the loop:
//   AX               remaining 64-byte blocks
//   CX               matrix base
//   BX, SI, DI, DX   read cursors for in[0..3]
//   R8               base of the out slice headers
//   R9               byte offset into every output (starts at start)
//   R10              scratch output data pointer
//   Z20-Z29          accumulators for out[0..9]
//   Z30              current input block
//   Z31              scratch product
TEXT ·mulGFNI_4x10_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ leaves ZF set when the block count is zero, so the branch can
	// follow it directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x10_64_end

	// Preload the first 20 of 40 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, DX

mulGFNI_4x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs.
	// The first input initialises the accumulators, so no XOR is needed.
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs.
	// Entries from offset 160 onward are not preloaded and are broadcast
	// from memory.
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD              Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 10 outputs: fetch each output's data pointer from its slice
	// header, then write at the running offset R9.
	MOVQ      (R8), R10
	VMOVDQU64 Z20, (R10)(R9*1)
	MOVQ      24(R8), R10
	VMOVDQU64 Z21, (R10)(R9*1)
	MOVQ      48(R8), R10
	VMOVDQU64 Z22, (R10)(R9*1)
	MOVQ      72(R8), R10
	VMOVDQU64 Z23, (R10)(R9*1)
	MOVQ      96(R8), R10
	VMOVDQU64 Z24, (R10)(R9*1)
	MOVQ      120(R8), R10
	VMOVDQU64 Z25, (R10)(R9*1)
	MOVQ      144(R8), R10
	VMOVDQU64 Z26, (R10)(R9*1)
	MOVQ      168(R8), R10
	VMOVDQU64 Z27, (R10)(R9*1)
	MOVQ      192(R8), R10
	VMOVDQU64 Z28, (R10)(R9*1)
	MOVQ      216(R8), R10
	VMOVDQU64 Z29, (R10)(R9*1)

	// Prepare for next loop
	ADDQ $0x40, R9
	DECQ AX
	JNZ  mulGFNI_4x10_64_loop
	VZEROUPPER

mulGFNI_4x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x10 coefficient matrix and overwrites
// 10 output slices, 32 bytes per iteration. Only n>>5 whole 32-byte blocks
// are processed; any remaining n&31 bytes are left untouched.
//
// matrix[i*10+j] (8 bytes each) is the VGF2P8AFFINEQB operand that maps
// input i to output j. Entries 0..3 are kept broadcast in Y0-Y3; every
// other entry is re-broadcast from memory each time it is used.
//
// Output data pointers are not cached in registers: each one is re-read
// from the out slice headers on every iteration and indexed by the running
// byte offset in R9.
//
// Register roles inside the loop:
//   AX               remaining 32-byte blocks
//   CX               matrix base
//   BX, SI, DI, DX   read cursors for in[0..3]
//   R8               base of the out slice headers
//   R9               byte offset into every output (starts at start)
//   R10              scratch output data pointer
//   Y4-Y13           accumulators for out[0..9]
//   Y14              current input block
//   Y15              scratch matrix entry / product
TEXT ·mulAvxGFNI_4x10(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ leaves ZF set when the block count is zero, so the branch can
	// follow it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x10_end

	// Preload the first 4 of 40 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, DX

mulAvxGFNI_4x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs.
	// The first input initialises the accumulators, so no XOR is needed;
	// entries that are not preloaded are broadcast into the accumulator
	// register itself before being overwritten by the product.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD   32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD   40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD   48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 10 outputs: fetch each output's data pointer from its slice
	// header, then write at the running offset R9.
	MOVQ    (R8), R10
	VMOVDQU Y4, (R10)(R9*1)
	MOVQ    24(R8), R10
	VMOVDQU Y5, (R10)(R9*1)
	MOVQ    48(R8), R10
	VMOVDQU Y6, (R10)(R9*1)
	MOVQ    72(R8), R10
	VMOVDQU Y7, (R10)(R9*1)
	MOVQ    96(R8), R10
	VMOVDQU Y8, (R10)(R9*1)
	MOVQ    120(R8), R10
	VMOVDQU Y9, (R10)(R9*1)
	MOVQ    144(R8), R10
	VMOVDQU Y10, (R10)(R9*1)
	MOVQ    168(R8), R10
	VMOVDQU Y11, (R10)(R9*1)
	MOVQ    192(R8), R10
	VMOVDQU Y12, (R10)(R9*1)
	MOVQ    216(R8), R10
	VMOVDQU Y13, (R10)(R9*1)

	// Prepare for next loop
	ADDQ $0x20, R9
	DECQ AX
	JNZ  mulAvxGFNI_4x10_loop
	VZEROUPPER

mulAvxGFNI_4x10_end:
	RET
|
|
|
|
// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies a 4x10 set of 64-bit GF2P8AFFINEQB matrices to 4 input slices and
// XORs the 10 products into the bytes already present in the 10 output slices.
// matrix[i*10+o] is the matrix used for input i and output o.
// start is a byte offset applied to every input and output slice.
// 64 bytes per slice are processed per iteration and n>>6 iterations run, so a
// tail of n below a multiple of 64 is not processed; n < 64 returns at once.
//
// Register roles:
//   AX           remaining iterations
//   CX           matrix base
//   BX,SI,DI,DX  read cursors for inputs 0..3
//   R8           base of the out slice headers (24 bytes apart)
//   R9           byte offset into every output (initially start)
//   R10          scratch: data pointer of the output being accessed
//   Z0-Z19       matrices 0..19 (inputs 0 and 1); matrices 20..39 are
//                broadcast from memory inside the loop
//   Z20-Z29      output accumulators; Z30 input data; Z31 product
TEXT ·mulGFNI_4x10_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_4x10_64Xor_end

	// Keep the 20 matrices of inputs 0 and 1 resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice; DX is overwritten last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, DX

mulGFNI_4x10_64Xor_loop:
	// Load 10 outputs (output pointers are re-read from the slice headers)
	MOVQ      (R8), R10
	VMOVDQU64 (R10)(R9*1), Z20
	MOVQ      24(R8), R10
	VMOVDQU64 (R10)(R9*1), Z21
	MOVQ      48(R8), R10
	VMOVDQU64 (R10)(R9*1), Z22
	MOVQ      72(R8), R10
	VMOVDQU64 (R10)(R9*1), Z23
	MOVQ      96(R8), R10
	VMOVDQU64 (R10)(R9*1), Z24
	MOVQ      120(R8), R10
	VMOVDQU64 (R10)(R9*1), Z25
	MOVQ      144(R8), R10
	VMOVDQU64 (R10)(R9*1), Z26
	MOVQ      168(R8), R10
	VMOVDQU64 (R10)(R9*1), Z27
	MOVQ      192(R8), R10
	VMOVDQU64 (R10)(R9*1), Z28
	MOVQ      216(R8), R10
	VMOVDQU64 (R10)(R9*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	// (matrices 20..29 are broadcast straight from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD              Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	// (matrices 30..39 are broadcast straight from memory)
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 10 outputs
	MOVQ      (R8), R10
	VMOVDQU64 Z20, (R10)(R9*1)
	MOVQ      24(R8), R10
	VMOVDQU64 Z21, (R10)(R9*1)
	MOVQ      48(R8), R10
	VMOVDQU64 Z22, (R10)(R9*1)
	MOVQ      72(R8), R10
	VMOVDQU64 Z23, (R10)(R9*1)
	MOVQ      96(R8), R10
	VMOVDQU64 Z24, (R10)(R9*1)
	MOVQ      120(R8), R10
	VMOVDQU64 Z25, (R10)(R9*1)
	MOVQ      144(R8), R10
	VMOVDQU64 Z26, (R10)(R9*1)
	MOVQ      168(R8), R10
	VMOVDQU64 Z27, (R10)(R9*1)
	MOVQ      192(R8), R10
	VMOVDQU64 Z28, (R10)(R9*1)
	MOVQ      216(R8), R10
	VMOVDQU64 Z29, (R10)(R9*1)

	// Prepare for next loop
	ADDQ $0x40, R9
	DECQ AX
	JNZ  mulGFNI_4x10_64Xor_loop
	VZEROUPPER

mulGFNI_4x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies a 4x10 set of 64-bit GF2P8AFFINEQB matrices to 4 input slices and
// XORs the 10 products into the bytes already present in the 10 output slices.
// matrix[i*10+o] is the matrix used for input i and output o.
// start is a byte offset applied to every input and output slice.
// 32 bytes per slice are processed per iteration and n>>5 iterations run, so a
// tail of n below a multiple of 32 is not processed; n < 32 returns at once.
//
// Register roles:
//   AX           remaining iterations
//   CX           matrix base
//   BX,SI,DI,DX  read cursors for inputs 0..3
//   R8           base of the out slice headers (24 bytes apart)
//   R9           byte offset into every output (initially start)
//   R10          scratch: data pointer of the output being accessed
//   Y0-Y3        matrices 0..3; every other matrix is broadcast into Y15
//                right before it is used
//   Y4-Y13       output accumulators; Y14 input data; Y15 matrix/product
TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_4x10Xor_end

	// Only the first 4 of the 40 matrices stay resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the data pointer of each input slice; DX is overwritten last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, DX

mulAvxGFNI_4x10Xor_loop:
	// Load 10 outputs (output pointers are re-read from the slice headers)
	MOVQ    (R8), R10
	VMOVDQU (R10)(R9*1), Y4
	MOVQ    24(R8), R10
	VMOVDQU (R10)(R9*1), Y5
	MOVQ    48(R8), R10
	VMOVDQU (R10)(R9*1), Y6
	MOVQ    72(R8), R10
	VMOVDQU (R10)(R9*1), Y7
	MOVQ    96(R8), R10
	VMOVDQU (R10)(R9*1), Y8
	MOVQ    120(R8), R10
	VMOVDQU (R10)(R9*1), Y9
	MOVQ    144(R8), R10
	VMOVDQU (R10)(R9*1), Y10
	MOVQ    168(R8), R10
	VMOVDQU (R10)(R9*1), Y11
	MOVQ    192(R8), R10
	VMOVDQU (R10)(R9*1), Y12
	MOVQ    216(R8), R10
	VMOVDQU (R10)(R9*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y4, Y15, Y4
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 10 outputs
	MOVQ    (R8), R10
	VMOVDQU Y4, (R10)(R9*1)
	MOVQ    24(R8), R10
	VMOVDQU Y5, (R10)(R9*1)
	MOVQ    48(R8), R10
	VMOVDQU Y6, (R10)(R9*1)
	MOVQ    72(R8), R10
	VMOVDQU Y7, (R10)(R9*1)
	MOVQ    96(R8), R10
	VMOVDQU Y8, (R10)(R9*1)
	MOVQ    120(R8), R10
	VMOVDQU Y9, (R10)(R9*1)
	MOVQ    144(R8), R10
	VMOVDQU Y10, (R10)(R9*1)
	MOVQ    168(R8), R10
	VMOVDQU Y11, (R10)(R9*1)
	MOVQ    192(R8), R10
	VMOVDQU Y12, (R10)(R9*1)
	MOVQ    216(R8), R10
	VMOVDQU Y13, (R10)(R9*1)

	// Prepare for next loop
	ADDQ $0x20, R9
	DECQ AX
	JNZ  mulAvxGFNI_4x10Xor_loop
	VZEROUPPER

mulAvxGFNI_4x10Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies one 64-bit GF2P8AFFINEQB matrix to each of 5 input slices and writes
// the XOR of the 5 products to out[0], overwriting its previous contents.
// matrix[i] is the matrix used for input i.
// start is a byte offset applied to every input slice and to the output.
// 64 bytes per slice are processed per iteration and n>>6 iterations run, so a
// tail of n below a multiple of 64 is not processed; n < 64 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R8              write cursor for out[0]
//   R9              start offset
//   Z0-Z4           matrices; Z5 accumulator; Z6 input data / product
TEXT ·mulGFNI_5x1_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x1_64_end

	// All 5 matrices fit in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulGFNI_5x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// (the first product initialises the accumulator, no XOR needed)
	VMOVDQU64      (DX), Z6
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z5

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64      (BX), Z6
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64      (SI), Z6
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64      (DI), Z6
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64      (CX), Z6
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Store 1 outputs
	VMOVDQU64 Z5, (R8)
	ADDQ      $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x1_64_loop
	VZEROUPPER

mulGFNI_5x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies one 64-bit GF2P8AFFINEQB matrix to each of 5 input slices and writes
// the XOR of the 5 products to out[0], overwriting its previous contents.
// matrix[i] is the matrix used for input i.
// start is a byte offset applied to every input slice and to the output.
// 32 bytes per slice are processed per iteration and n>>5 iterations run, so a
// tail of n below a multiple of 32 is not processed; n < 32 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R8              write cursor for out[0]
//   R9              start offset
//   Y0-Y4           matrices; Y5 accumulator; Y6 input data / product
TEXT ·mulAvxGFNI_5x1(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x1_end

	// All 5 matrices fit in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulAvxGFNI_5x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// (the first product initialises the accumulator, no XOR needed)
	VMOVDQU        (DX), Y6
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y5

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (BX), Y6
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU        (SI), Y6
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU        (DI), Y6
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU        (CX), Y6
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Store 1 outputs
	VMOVDQU Y5, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x1_loop
	VZEROUPPER

mulAvxGFNI_5x1_end:
	RET
|
|
|
|
// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies one 64-bit GF2P8AFFINEQB matrix to each of 5 input slices and XORs
// the 5 products into the bytes already present in out[0].
// matrix[i] is the matrix used for input i.
// start is a byte offset applied to every input slice and to the output.
// 64 bytes per slice are processed per iteration and n>>6 iterations run, so a
// tail of n below a multiple of 64 is not processed; n < 64 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R8              read/write cursor for out[0]
//   R9              start offset
//   Z0-Z4           matrices; Z5 accumulator; Z6 input data / product
TEXT ·mulGFNI_5x1_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x1_64Xor_end

	// All 5 matrices fit in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulGFNI_5x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R8), Z5

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64      (DX), Z6
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64      (BX), Z6
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64      (SI), Z6
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64      (DI), Z6
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64      (CX), Z6
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
	VXORPD         Z5, Z6, Z5

	// Store 1 outputs
	VMOVDQU64 Z5, (R8)
	ADDQ      $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x1_64Xor_loop
	VZEROUPPER

mulGFNI_5x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies one 64-bit GF2P8AFFINEQB matrix to each of 5 input slices and XORs
// the 5 products into the bytes already present in out[0].
// matrix[i] is the matrix used for input i.
// start is a byte offset applied to every input slice and to the output.
// 32 bytes per slice are processed per iteration and n>>5 iterations run, so a
// tail of n below a multiple of 32 is not processed; n < 32 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R8              read/write cursor for out[0]
//   R9              start offset
//   Y0-Y4           matrices; Y5 accumulator; Y6 input data / product
TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x1Xor_end

	// All 5 matrices fit in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulAvxGFNI_5x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R8), Y5

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU        (DX), Y6
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (BX), Y6
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU        (SI), Y6
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU        (DI), Y6
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU        (CX), Y6
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
	VXORPD         Y5, Y6, Y5

	// Store 1 outputs
	VMOVDQU Y5, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x1Xor_loop
	VZEROUPPER

mulAvxGFNI_5x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies a 5x2 set of 64-bit GF2P8AFFINEQB matrices to 5 input slices and
// writes the two XOR-combined results to out[0] and out[1], overwriting their
// previous contents. matrix[i*2+o] is the matrix for input i and output o.
// start is a byte offset applied to every input and output slice.
// 64 bytes per slice are processed per iteration and n>>6 iterations run, so a
// tail of n below a multiple of 64 is not processed; n < 64 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R9, R8          write cursors for out[0], out[1]
//   R10             start offset
//   Z0-Z9           matrices; Z10,Z11 accumulators; Z12 input data; Z13 product
TEXT ·mulGFNI_5x2_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x2_64_end

	// All 10 matrices fit in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX

	// R8 holds the out header base first and ends up as the out[1] cursor.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulGFNI_5x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// (the first products initialise the accumulators, no XOR needed)
	VMOVDQU64      (DX), Z12
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z11

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z12
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z12
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z12
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (CX), Z12
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Store 2 outputs
	VMOVDQU64 Z10, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z11, (R8)
	ADDQ      $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x2_64_loop
	VZEROUPPER

mulGFNI_5x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies a 5x2 set of 64-bit GF2P8AFFINEQB matrices to 5 input slices and
// writes the two XOR-combined results to out[0] and out[1], overwriting their
// previous contents. matrix[i*2+o] is the matrix for input i and output o.
// start is a byte offset applied to every input and output slice.
// 32 bytes per slice are processed per iteration and n>>5 iterations run, so a
// tail of n below a multiple of 32 is not processed; n < 32 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R9, R8          write cursors for out[0], out[1]
//   R10             start offset
//   Y0-Y9           matrices; Y10,Y11 accumulators; Y12 input data; Y13 product
TEXT ·mulAvxGFNI_5x2(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x2_end

	// All 10 matrices fit in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX

	// R8 holds the out header base first and ends up as the out[1] cursor.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulAvxGFNI_5x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// (the first products initialise the accumulators, no XOR needed)
	VMOVDQU        (DX), Y12
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y11

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (BX), Y12
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (SI), Y12
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (DI), Y12
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (CX), Y12
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Store 2 outputs
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x2_loop
	VZEROUPPER

mulAvxGFNI_5x2_end:
	RET
|
|
|
|
// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies a 5x2 set of 64-bit GF2P8AFFINEQB matrices to 5 input slices and
// XORs the results into the bytes already present in out[0] and out[1].
// matrix[i*2+o] is the matrix for input i and output o.
// start is a byte offset applied to every input and output slice.
// 64 bytes per slice are processed per iteration and n>>6 iterations run, so a
// tail of n below a multiple of 64 is not processed; n < 64 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R9, R8          read/write cursors for out[0], out[1]
//   R10             start offset
//   Z0-Z9           matrices; Z10,Z11 accumulators; Z12 input data; Z13 product
TEXT ·mulGFNI_5x2_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x2_64Xor_end

	// All 10 matrices fit in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX

	// R8 holds the out header base first and ends up as the out[1] cursor.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulGFNI_5x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R9), Z10
	VMOVDQU64 (R8), Z11

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64      (DX), Z12
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z12
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z12
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z12
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (CX), Z12
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD         Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
	VXORPD         Z11, Z13, Z11

	// Store 2 outputs
	VMOVDQU64 Z10, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z11, (R8)
	ADDQ      $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x2_64Xor_loop
	VZEROUPPER

mulGFNI_5x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies a 5x2 set of 64-bit GF2P8AFFINEQB matrices to 5 input slices and
// XORs the results into the bytes already present in out[0] and out[1].
// matrix[i*2+o] is the matrix for input i and output o.
// start is a byte offset applied to every input and output slice.
// 32 bytes per slice are processed per iteration and n>>5 iterations run, so a
// tail of n below a multiple of 32 is not processed; n < 32 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R9, R8          read/write cursors for out[0], out[1]
//   R10             start offset
//   Y0-Y9           matrices; Y10,Y11 accumulators; Y12 input data; Y13 product
TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x2Xor_end

	// All 10 matrices fit in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX

	// R8 holds the out header base first and ends up as the out[1] cursor.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulAvxGFNI_5x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R9), Y10
	VMOVDQU (R8), Y11

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (DX), Y12
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (BX), Y12
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (SI), Y12
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (DI), Y12
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (CX), Y12
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD         Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
	VXORPD         Y11, Y13, Y11

	// Store 2 outputs
	VMOVDQU Y10, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y11, (R8)
	ADDQ    $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x2Xor_loop
	VZEROUPPER

mulAvxGFNI_5x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies a 5x3 set of 64-bit GF2P8AFFINEQB matrices to 5 input slices and
// writes the three XOR-combined results to out[0..2], overwriting their
// previous contents. matrix[i*3+o] is the matrix for input i and output o.
// start is a byte offset applied to every input and output slice.
// 64 bytes per slice are processed per iteration and n>>6 iterations run, so a
// tail of n below a multiple of 64 is not processed; n < 64 returns at once.
//
// Register roles:
//   AX              remaining iterations
//   CX              matrix base during setup, then read cursor for input 4
//   DX,BX,SI,DI,CX  read cursors for inputs 0..4
//   R9, R10, R8     write cursors for out[0], out[1], out[2]
//   R11             start offset
//   Z0-Z14          matrices; Z15-Z17 accumulators; Z18 input data; Z19 product
TEXT ·mulGFNI_5x3_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x3_64_end

	// All 15 matrices fit in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// CX is free again: reuse it for the input headers, overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX

	// R8 holds the out header base first and ends up as the out[2] cursor.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R8
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, R8

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, CX

mulGFNI_5x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (the first products initialise the accumulators, no XOR needed)
	VMOVDQU64      (DX), Z18
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z17

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z18
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD         Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD         Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD         Z17, Z19, Z17

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z18
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD         Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD         Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD         Z17, Z19, Z17

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (DI), Z18
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
	VXORPD         Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
	VXORPD         Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
	VXORPD         Z17, Z19, Z17

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64      (CX), Z18
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
	VXORPD         Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
	VXORPD         Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
	VXORPD         Z17, Z19, Z17

	// Store 3 outputs
	VMOVDQU64 Z15, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z16, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z17, (R8)
	ADDQ      $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x3_64_loop
	VZEROUPPER

mulGFNI_5x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 5 input slices by a 5x3 matrix of 8-byte GFNI affine tables and
// overwrites 3 output slices, 32 bytes per iteration.
//   matrix[i*3+j] is applied to input i and accumulated (XOR) into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/32 whole 32-byte blocks are processed; if n < 32 nothing is written.
// Register roles:
//   AX              remaining 32-byte blocks
//   CX              matrix base (kept: entries 11-14 are re-broadcast per use)
//   Y0-Y10          broadcast matrix entries 0-10
//   BX,SI,DI,R8,DX  read pointers for in[0..4]
//   R10,R11,R9      write pointers for out[0..2]
//   Y11-Y13         output accumulators, Y14 current input, Y15 scratch
TEXT ·mulAvxGFNI_5x3(SB), $0-88
	// Loading 11 of 15 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Slice headers are 24 bytes apart; load each data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, DX

mulAvxGFNI_5x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from here on some tables are broadcast from memory into Y15)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x3_loop
	VZEROUPPER

mulAvxGFNI_5x3_end:
	RET
|
|
|
|
// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 5 input slices by a 5x3 matrix of 8-byte GFNI affine tables and
// XORs the products into the existing contents of 3 output slices, 64 bytes
// per iteration.
//   matrix[i*3+j] is applied to input i and XORed into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/64 whole 64-byte blocks are processed; if n < 64 nothing is written.
// Register roles:
//   AX              remaining 64-byte blocks
//   Z0-Z14          broadcast matrix entries
//   DX,BX,SI,DI,CX  read pointers for in[0..4]
//   R9,R10,R8       read/write pointers for out[0..2]
//   Z15-Z17         output accumulators, Z18 current input, Z19 scratch product
TEXT ·mulGFNI_5x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// 20 vector registers used (Z0-Z19)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Slice headers are 24 bytes apart; load each data pointer.
	// CX is reused for in[4] once the matrix is fully in registers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R8
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, R8

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, CX

mulGFNI_5x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R9), Z15
	VMOVDQU64 (R10), Z16
	VMOVDQU64 (R8), Z17

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z18
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z18
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z18
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z18
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (CX), Z18
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Store 3 outputs
	VMOVDQU64 Z15, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z16, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z17, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x3_64Xor_loop
	VZEROUPPER

mulGFNI_5x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 5 input slices by a 5x3 matrix of 8-byte GFNI affine tables and
// XORs the products into the existing contents of 3 output slices, 32 bytes
// per iteration.
//   matrix[i*3+j] is applied to input i and XORed into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/32 whole 32-byte blocks are processed; if n < 32 nothing is written.
// Register roles:
//   AX              remaining 32-byte blocks
//   CX              matrix base (kept: entries 11-14 are re-broadcast per use)
//   Y0-Y10          broadcast matrix entries 0-10
//   BX,SI,DI,R8,DX  read pointers for in[0..4]
//   R10,R11,R9      read/write pointers for out[0..2]
//   Y11-Y13         output accumulators, Y14 current input, Y15 scratch
TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88
	// Loading 11 of 15 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Slice headers are 24 bytes apart; load each data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, DX

mulAvxGFNI_5x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R10), Y11
	VMOVDQU (R11), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from here on some tables are broadcast from memory into Y15)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x3Xor_loop
	VZEROUPPER

mulAvxGFNI_5x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 5 input slices by a 5x4 matrix of 8-byte GFNI affine tables and
// overwrites 4 output slices, 64 bytes per iteration.
//   matrix[i*4+j] is applied to input i and accumulated (XOR) into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/64 whole 64-byte blocks are processed; if n < 64 nothing is written.
// Register roles:
//   AX              remaining 64-byte blocks
//   Z0-Z19          broadcast matrix entries
//   DX,BX,SI,DI,CX  read pointers for in[0..4]
//   R9,R10,R11,R8   write pointers for out[0..3]
//   Z20-Z23         output accumulators, Z24 current input, Z25 scratch product
TEXT ·mulGFNI_5x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// 26 vector registers used (Z0-Z25)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Slice headers are 24 bytes apart; load each data pointer.
	// CX is reused for in[4] once the matrix is fully in registers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R8
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R8

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, CX

mulGFNI_5x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z23

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 4 outputs
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x4_64_loop
	VZEROUPPER

mulGFNI_5x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 5 input slices by a 5x4 matrix of 8-byte GFNI affine tables and
// overwrites 4 output slices, 32 bytes per iteration.
//   matrix[i*4+j] is applied to input i and accumulated (XOR) into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/32 whole 32-byte blocks are processed; if n < 32 nothing is written.
// Register roles:
//   AX              remaining 32-byte blocks
//   CX              matrix base (kept: entries 10-19 are re-broadcast per use)
//   Y0-Y9           broadcast matrix entries 0-9
//   BX,SI,DI,R8,DX  read pointers for in[0..4]
//   R10,R11,R12,R9  write pointers for out[0..3]
//   Y10-Y13         output accumulators, Y14 current input, Y15 scratch
TEXT ·mulAvxGFNI_5x4(SB), $0-88
	// Loading 10 of 20 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers are 24 bytes apart; load each data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, DX

mulAvxGFNI_5x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from here on some tables are broadcast from memory into Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x4_loop
	VZEROUPPER

mulAvxGFNI_5x4_end:
	RET
|
|
|
|
// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 5 input slices by a 5x4 matrix of 8-byte GFNI affine tables and
// XORs the products into the existing contents of 4 output slices, 64 bytes
// per iteration.
//   matrix[i*4+j] is applied to input i and XORed into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/64 whole 64-byte blocks are processed; if n < 64 nothing is written.
// Register roles:
//   AX              remaining 64-byte blocks
//   Z0-Z19          broadcast matrix entries
//   DX,BX,SI,DI,CX  read pointers for in[0..4]
//   R9,R10,R11,R8   read/write pointers for out[0..3]
//   Z20-Z23         output accumulators, Z24 current input, Z25 scratch product
TEXT ·mulGFNI_5x4_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// 26 vector registers used (Z0-Z25)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Slice headers are 24 bytes apart; load each data pointer.
	// CX is reused for in[4] once the matrix is fully in registers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R8
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R8

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, CX

mulGFNI_5x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (R9), Z20
	VMOVDQU64 (R10), Z21
	VMOVDQU64 (R11), Z22
	VMOVDQU64 (R8), Z23

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 4 outputs
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x4_64Xor_loop
	VZEROUPPER

mulGFNI_5x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 5 input slices by a 5x4 matrix of 8-byte GFNI affine tables and
// XORs the products into the existing contents of 4 output slices, 32 bytes
// per iteration.
//   matrix[i*4+j] is applied to input i and XORed into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/32 whole 32-byte blocks are processed; if n < 32 nothing is written.
// Register roles:
//   AX              remaining 32-byte blocks
//   CX              matrix base (kept: entries 10-19 are re-broadcast per use)
//   Y0-Y9           broadcast matrix entries 0-9
//   BX,SI,DI,R8,DX  read pointers for in[0..4]
//   R10,R11,R12,R9  read/write pointers for out[0..3]
//   Y10-Y13         output accumulators, Y14 current input, Y15 scratch
TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88
	// Loading 10 of 20 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers are 24 bytes apart; load each data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, DX

mulAvxGFNI_5x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from here on some tables are broadcast from memory into Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x4Xor_loop
	VZEROUPPER

mulAvxGFNI_5x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 5 input slices by a 5x5 matrix of 8-byte GFNI affine tables and
// overwrites 5 output slices, 64 bytes per iteration.
//   matrix[i*5+j] is applied to input i and accumulated (XOR) into output j.
//   Processing starts at byte offset start in every input and output.
//   Only n/64 whole 64-byte blocks are processed; if n < 64 nothing is written.
// Register roles:
//   AX                 remaining 64-byte blocks
//   Z0-Z24             broadcast matrix entries
//   DX,BX,SI,DI,CX     read pointers for in[0..4]
//   R9,R10,R11,R12,R8  write pointers for out[0..4]
//   Z25-Z29            output accumulators, Z30 current input, Z31 scratch product
TEXT ·mulGFNI_5x5_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// All 32 vector registers used (Z0-Z31)
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x5_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Slice headers are 24 bytes apart; load each data pointer.
	// CX is reused for in[4] once the matrix is fully in registers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R8
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R8

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, CX

mulGFNI_5x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z26, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z27, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z28, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x5_64_loop
	VZEROUPPER

mulGFNI_5x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x5 combines 5 input slices into 5 output slices, 32 bytes per
// loop iteration. For every block, out[j] is overwritten with the XOR over
// i = 0..4 of VGF2P8AFFINEQB(matrix[i*5+j], in[i]) (constant term 0).
//
//   matrix: 25 uint64 entries; entry i*5+j (byte offset 8*(i*5+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 slice headers are read;
//           no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 blocks are processed and any remainder below
//           32 bytes is ignored. Returns immediately when n>>5 is zero.
TEXT ·mulAvxGFNI_5x5(SB), $0-88
	// Loading 9 of 25 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x5_end

	// Matrix entries 0..8 stay resident in Y0..Y8; entries 9..24 are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Input data pointers (slice headers are 24 bytes apart):
	// in[0..4] -> BX, SI, DI, R8, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..4] -> R10, R11, R12, R13, R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R9
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R9

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, DX

mulAvxGFNI_5x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (first input initialises the accumulators Y9..Y13)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x5_loop
	VZEROUPPER

mulAvxGFNI_5x5_end:
	RET
|
|
|
|
// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x5_64Xor accumulates 5 input slices into 5 output slices, 64
// bytes per loop iteration. For every block, out[j] becomes its previous
// contents XORed with the XOR over i = 0..4 of
// VGF2P8AFFINEQB(matrix[i*5+j], in[i]) (constant term 0).
//
//   matrix: 25 uint64 entries; entry i*5+j (byte offset 8*(i*5+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 slice headers are read;
//           no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>6 blocks are processed and any remainder below
//           64 bytes is ignored. Returns immediately when n>>6 is zero.
TEXT ·mulGFNI_5x5_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x5_64Xor_end

	// All 25 matrix entries stay resident in Z0..Z24.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// The matrix pointer is no longer needed, so CX is reused.
	// Input data pointers: in[0..4] -> DX, BX, SI, DI, CX
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX

	// Output data pointers: out[0..4] -> R9, R10, R11, R12, R8
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R8
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R8

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, CX

mulGFNI_5x5_64Xor_loop:
	// Load 5 outputs (existing contents seed the accumulators)
	VMOVDQU64 (R9), Z25
	VMOVDQU64 (R10), Z26
	VMOVDQU64 (R11), Z27
	VMOVDQU64 (R12), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64      (CX), Z30
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R9)
	ADDQ      $0x40, R9
	VMOVDQU64 Z26, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z27, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z28, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z29, (R8)
	ADDQ      $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x5_64Xor_loop
	VZEROUPPER

mulGFNI_5x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x5Xor accumulates 5 input slices into 5 output slices, 32
// bytes per loop iteration. For every block, out[j] becomes its previous
// contents XORed with the XOR over i = 0..4 of
// VGF2P8AFFINEQB(matrix[i*5+j], in[i]) (constant term 0).
//
//   matrix: 25 uint64 entries; entry i*5+j (byte offset 8*(i*5+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 slice headers are read;
//           no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 blocks are processed and any remainder below
//           32 bytes is ignored. Returns immediately when n>>5 is zero.
TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88
	// Loading 9 of 25 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x5Xor_end

	// Matrix entries 0..8 stay resident in Y0..Y8; entries 9..24 are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Input data pointers (slice headers are 24 bytes apart):
	// in[0..4] -> BX, SI, DI, R8, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..4] -> R10, R11, R12, R13, R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R9
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R9

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, DX

mulAvxGFNI_5x5Xor_loop:
	// Load 5 outputs (existing contents seed the accumulators)
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x5Xor_loop
	VZEROUPPER

mulAvxGFNI_5x5Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x6_64 combines 5 input slices into 6 output slices, 64 bytes per
// loop iteration. For every block, out[j] is overwritten with the XOR over
// i = 0..4 of VGF2P8AFFINEQB(matrix[i*6+j], in[i]) (constant term 0).
//
//   matrix: 30 uint64 entries; entry i*6+j (byte offset 8*(i*6+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 input and first 6 output
//           slice headers are read; no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>6 blocks are processed and any remainder below
//           64 bytes is ignored. Returns immediately when n>>6 is zero.
TEXT ·mulGFNI_5x6_64(SB), $0-88
	// Loading 24 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x6_64_end

	// Matrix entries 0..23 stay resident in Z0..Z23; entries 24..29 are
	// used as embedded-broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Input data pointers (slice headers are 24 bytes apart):
	// in[0..4] -> BX, SI, DI, R8, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..5] -> R10, R11, R12, R13, R14, R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX

mulGFNI_5x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators Z24..Z29)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (matrix entries 24..29 come straight from memory via .BCST)
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z29, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x6_64_loop
	VZEROUPPER

mulGFNI_5x6_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x6 combines 5 input slices into 6 output slices, 32 bytes per
// loop iteration. For every block, out[j] is overwritten with the XOR over
// i = 0..4 of VGF2P8AFFINEQB(matrix[i*6+j], in[i]) (constant term 0).
//
//   matrix: 30 uint64 entries; entry i*6+j (byte offset 8*(i*6+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 input and first 6 output
//           slice headers are read; no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 blocks are processed and any remainder below
//           32 bytes is ignored. Returns immediately when n>>5 is zero.
TEXT ·mulAvxGFNI_5x6(SB), $0-88
	// Loading 8 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x6_end

	// Matrix entries 0..7 stay resident in Y0..Y7; entries 8..29 are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers (slice headers are 24 bytes apart):
	// in[0..4] -> BX, SI, DI, R8, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..5] -> R10, R11, R12, R13, R14, R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX

mulAvxGFNI_5x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators Y8..Y13)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x6_loop
	VZEROUPPER

mulAvxGFNI_5x6_end:
	RET
|
|
|
|
// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x6_64Xor accumulates 5 input slices into 6 output slices, 64
// bytes per loop iteration. For every block, out[j] becomes its previous
// contents XORed with the XOR over i = 0..4 of
// VGF2P8AFFINEQB(matrix[i*6+j], in[i]) (constant term 0).
//
//   matrix: 30 uint64 entries; entry i*6+j (byte offset 8*(i*6+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 input and first 6 output
//           slice headers are read; no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>6 blocks are processed and any remainder below
//           64 bytes is ignored. Returns immediately when n>>6 is zero.
TEXT ·mulGFNI_5x6_64Xor(SB), $0-88
	// Loading 24 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x6_64Xor_end

	// Matrix entries 0..23 stay resident in Z0..Z23; entries 24..29 are
	// used as embedded-broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Input data pointers (slice headers are 24 bytes apart):
	// in[0..4] -> BX, SI, DI, R8, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..5] -> R10, R11, R12, R13, R14, R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX

mulGFNI_5x6_64Xor_loop:
	// Load 6 outputs (existing contents seed the accumulators)
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R9), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (matrix entries 24..29 come straight from memory via .BCST)
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z29, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_5x6_64Xor_loop
	VZEROUPPER

mulGFNI_5x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x6Xor accumulates 5 input slices into 6 output slices, 32
// bytes per loop iteration. For every block, out[j] becomes its previous
// contents XORed with the XOR over i = 0..4 of
// VGF2P8AFFINEQB(matrix[i*6+j], in[i]) (constant term 0).
//
//   matrix: 30 uint64 entries; entry i*6+j (byte offset 8*(i*6+j)) is the
//           bit matrix applied to input i for output j.
//   in/out: only the data pointers of the first 5 input and first 6 output
//           slice headers are read; no length checks are performed here.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 blocks are processed and any remainder below
//           32 bytes is ignored. Returns immediately when n>>5 is zero.
TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88
	// Loading 8 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x6Xor_end

	// Matrix entries 0..7 stay resident in Y0..Y7; entries 8..29 are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers (slice headers are 24 bytes apart):
	// in[0..4] -> BX, SI, DI, R8, DX
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..5] -> R10, R11, R12, R13, R14, R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX

mulAvxGFNI_5x6Xor_loop:
	// Load 6 outputs (existing contents seed the accumulators)
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_5x6Xor_loop
	VZEROUPPER

mulAvxGFNI_5x6Xor_end:
	RET
|
|
|
|
// mulGFNI_5x7_64 combines 5 input slices into 7 output slices, overwriting out.
// For every 64-byte block: out[o] = XOR over i of AFFINE(matrix[i*7+o], in[i]),
// where AFFINE is VGF2P8AFFINEQB with a zero constant term.
// Processes 64 bytes/loop. Only n>>6 whole blocks are handled; the function
// returns without touching memory when n>>6 is zero.
// Inputs and outputs are both accessed starting at byte offset start.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// Reed-Solomon coefficient - confirm against the generator.

// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x7_64(SB), $8-88
	// Loading 23 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from
	// the result, so the branch can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_5x7_64_end

	// Z0..Z22 = matrix[0..22], each 8-byte entry replicated to every qword lane.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Input data pointers: in[0..4] -> BX, SI, DI, R8, DX.
	// Consecutive slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..6] -> R10..R15, R9.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9

	// BP = start. NOTE(review): BP is used as scratch here; presumably the
	// non-zero frame size is what keeps the caller's BP intact - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulGFNI_5x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// The first input initialises the accumulators Z23..Z29 directly.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	// Z31 is the per-product temporary that is XORed into each accumulator.
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// Matrix entries 23..34 are not register-resident; they are read from
	// memory with an embedded broadcast (.BCST).
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x7_64_loop
	VZEROUPPER

mulGFNI_5x7_64_end:
	RET
|
|
|
|
// mulAvxGFNI_5x7 combines 5 input slices into 7 output slices, overwriting out.
// For every 32-byte block: out[o] = XOR over i of AFFINE(matrix[i*7+o], in[i]),
// where AFFINE is VGF2P8AFFINEQB with a zero constant term.
// Processes 32 bytes/loop. Only n>>5 whole blocks are handled; the function
// returns without touching memory when n>>5 is zero.
// Inputs and outputs are both accessed starting at byte offset start.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// Reed-Solomon coefficient - confirm against the generator.

// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x7(SB), $8-88
	// Loading 7 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from
	// the result, so the branch can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x7_end

	// Y0..Y6 = matrix[0..6] (the entries for input 0), each replicated to
	// every qword lane. All other entries are re-broadcast inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Input data pointers: in[0..4] -> BX, SI, DI, R8, DX.
	// Consecutive slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..6] -> R10..R15, R9.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9

	// BP = start. NOTE(review): BP is used as scratch here; presumably the
	// non-zero frame size is what keeps the caller's BP intact - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulAvxGFNI_5x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// The first input initialises the accumulators Y7..Y13 directly.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// Y15 first holds the broadcast matrix entry, then the product that is
	// XORed into the accumulator.
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x7_loop
	VZEROUPPER

mulAvxGFNI_5x7_end:
	RET
|
|
|
|
// mulGFNI_5x7_64Xor combines 5 input slices into 7 output slices and XORs the
// result into the existing contents of out.
// For every 64-byte block: out[o] ^= XOR over i of AFFINE(matrix[i*7+o], in[i]),
// where AFFINE is VGF2P8AFFINEQB with a zero constant term.
// Processes 64 bytes/loop. Only n>>6 whole blocks are handled; the function
// returns without touching memory when n>>6 is zero.
// Inputs and outputs are both accessed starting at byte offset start.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// Reed-Solomon coefficient - confirm against the generator.

// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x7_64Xor(SB), $8-88
	// Loading 23 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from
	// the result, so the branch can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_5x7_64Xor_end

	// Z0..Z22 = matrix[0..22], each 8-byte entry replicated to every qword lane.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Input data pointers: in[0..4] -> BX, SI, DI, R8, DX.
	// Consecutive slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..6] -> R10..R15, R9.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9

	// BP = start. NOTE(review): BP is used as scratch here; presumably the
	// non-zero frame size is what keeps the caller's BP intact - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulGFNI_5x7_64Xor_loop:
	// Load 7 outputs
	// The accumulators Z23..Z29 start from the current output contents.
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R9), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	// Z31 is the per-product temporary that is XORed into each accumulator.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// Matrix entries 23..34 are not register-resident; they are read from
	// memory with an embedded broadcast (.BCST).
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x7_64Xor_loop
	VZEROUPPER

mulGFNI_5x7_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_5x7Xor combines 5 input slices into 7 output slices and XORs the
// result into the existing contents of out.
// For every 32-byte block: out[o] ^= XOR over i of AFFINE(matrix[i*7+o], in[i]),
// where AFFINE is VGF2P8AFFINEQB with a zero constant term.
// Processes 32 bytes/loop. Only n>>5 whole blocks are handled; the function
// returns without touching memory when n>>5 is zero.
// Inputs and outputs are both accessed starting at byte offset start.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// Reed-Solomon coefficient - confirm against the generator.

// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88
	// Loading 7 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from
	// the result, so the branch can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x7Xor_end

	// Y0..Y6 = matrix[0..6] (the entries for input 0), each replicated to
	// every qword lane. All other entries are re-broadcast inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Input data pointers: in[0..4] -> BX, SI, DI, R8, DX.
	// Consecutive slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Output data pointers: out[0..6] -> R10..R15, R9.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9

	// BP = start. NOTE(review): BP is used as scratch here; presumably the
	// non-zero frame size is what keeps the caller's BP intact - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulAvxGFNI_5x7Xor_loop:
	// Load 7 outputs
	// The accumulators Y7..Y13 start from the current output contents.
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	// Y15 is the per-product temporary that is XORed into each accumulator.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// From here on Y15 first holds the broadcast matrix entry, then the product.
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x7Xor_loop
	VZEROUPPER

mulAvxGFNI_5x7Xor_end:
	RET
|
|
|
|
// mulGFNI_5x8_64 combines 5 input slices into 8 output slices, overwriting out.
// For every 64-byte block: out[o] = XOR over i of AFFINE(matrix[i*8+o], in[i]),
// where AFFINE is VGF2P8AFFINEQB with a zero constant term.
// Processes 64 bytes/loop. Only n>>6 whole blocks are handled; the function
// returns without touching memory when n>>6 is zero.
// Inputs and outputs are both accessed starting at byte offset start.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// Reed-Solomon coefficient - confirm against the generator.

// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x8_64(SB), $8-88
	// Loading 22 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX is only used for the early-exit check here; it is reused as an input
	// pointer below. SHRQ with a non-zero count sets ZF from the result, so
	// the branch can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_5x8_64_end

	// Z0..Z21 = matrix[0..21], each 8-byte entry replicated to every qword lane.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Input data pointers: in[0..4] -> DX, BX, SI, DI, AX.
	// Consecutive slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX

	// Output data pointers: out[0..7] -> R9..R15, R8.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8

	// BP = start. NOTE(review): BP is used as scratch here; presumably the
	// non-zero frame size is what keeps the caller's BP intact - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	// BP now becomes the loop counter (number of 64-byte blocks).
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_5x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// The first input initialises the accumulators Z22..Z29 directly.
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	// Z31 is the per-product temporary that is XORed into each accumulator.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// Matrix entries 22..39 are not register-resident; they are read from
	// memory with an embedded broadcast (.BCST).
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_5x8_64_loop
	VZEROUPPER

mulGFNI_5x8_64_end:
	RET
|
|
|
|
// mulAvxGFNI_5x8 combines 5 input slices into 8 output slices, overwriting out.
// For every 32-byte block: out[o] = XOR over i of AFFINE(matrix[i*8+o], in[i]),
// where AFFINE is VGF2P8AFFINEQB with a zero constant term.
// Processes 32 bytes/loop. Only n>>5 whole blocks are handled; the function
// returns without touching memory when n>>5 is zero.
// Inputs and outputs are both accessed starting at byte offset start.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// Reed-Solomon coefficient - confirm against the generator.

// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x8(SB), $8-88
	// Loading 6 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX is only used for the early-exit check here; it is reused as an input
	// pointer below. SHRQ with a non-zero count sets ZF from the result, so
	// the branch can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x8_end

	// Y0..Y5 = matrix[0..5], each replicated to every qword lane.
	// All other entries are re-broadcast inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Input data pointers: in[0..4] -> DX, BX, SI, DI, AX.
	// Consecutive slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX

	// Output data pointers: out[0..7] -> R9..R15, R8.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8

	// BP = start. NOTE(review): BP is used as scratch here; presumably the
	// non-zero frame size is what keeps the caller's BP intact - confirm.
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	// BP now becomes the loop counter (number of 32-byte blocks).
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_5x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// The first input initialises the accumulators Y6..Y13 directly. The last
	// two use their own destination register to hold the broadcast entry.
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	// Y15 first holds the broadcast matrix entry, then the product that is
	// XORed into the accumulator.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_5x8_loop
	VZEROUPPER

mulAvxGFNI_5x8_end:
	RET
|
|
|
|
// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x8_64Xor applies GFNI affine matrices to 5 input slices and XORs
// the products into the existing contents of 8 output slices.
//
// matrix holds 40 64-bit matrices, indexed as matrix[input*8+output].
// in and out are read as arrays of 24-byte slice headers; only the data
// pointers (offsets 0, 24, 48, ...) are used, slice lengths are not checked.
// start is a byte offset added to every input and output pointer.
// n is a byte count: n>>6 blocks of 64 bytes are processed and any remainder
// is ignored. The routine returns without touching memory when n>>6 is zero.
//
// Register roles:
//   CX               matrix base
//   Z0-Z21           matrices 0-21, broadcast once before the loop
//   DX, BX, SI, DI, AX  input pointers 0-4 (AX is reused after the length check)
//   R9-R15, R8       output pointers 0-7
//   BP               remaining 64-byte blocks
//   Z22-Z29          output accumulators, Z30 input block, Z31 scratch
//
// NOTE(review): BP is used as a general-purpose register; the non-zero frame
// size ($8) presumably exists so the Go assembler preserves it - confirm
// before changing the frame size.
TEXT ·mulGFNI_5x8_64Xor(SB), $8-88
	// Loading 22 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Input data pointers; AX is overwritten last because it holds the header base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX

	// Output data pointers; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_5x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// Matrices 22 and up are not resident; they are broadcast from memory.
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_5x8_64Xor_loop
	VZEROUPPER

mulGFNI_5x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x8Xor applies GFNI affine matrices to 5 input slices and XORs
// the products into the existing contents of 8 output slices, using 256-bit
// registers.
//
// matrix holds 40 64-bit matrices, indexed as matrix[input*8+output].
// in and out are read as arrays of 24-byte slice headers; only the data
// pointers (offsets 0, 24, 48, ...) are used, slice lengths are not checked.
// start is a byte offset added to every input and output pointer.
// n is a byte count: n>>5 blocks of 32 bytes are processed and any remainder
// is ignored. The routine returns without touching memory when n>>5 is zero.
//
// Register roles:
//   CX               matrix base
//   Y0-Y5            matrices 0-5, broadcast once before the loop
//   DX, BX, SI, DI, AX  input pointers 0-4 (AX is reused after the length check)
//   R9-R15, R8       output pointers 0-7
//   BP               remaining 32-byte blocks
//   Y6-Y13           output accumulators, Y14 input block
//   Y15              scratch: non-resident matrix, then the product
//
// NOTE(review): BP is used as a general-purpose register; the non-zero frame
// size ($8) presumably exists so the Go assembler preserves it - confirm
// before changing the frame size.
TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88
	// Loading 6 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Input data pointers; AX is overwritten last because it holds the header base.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX

	// Output data pointers; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_5x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	// Matrices 6 and up are not resident; each is broadcast into Y15 first.
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_5x8Xor_loop
	VZEROUPPER

mulAvxGFNI_5x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x9_64 applies GFNI affine matrices to 5 input slices and writes
// the XOR of the per-input products to 9 output slices, overwriting whatever
// the outputs held before.
//
// matrix holds 45 64-bit matrices, indexed as matrix[input*9+output].
// in and out are read as arrays of 24-byte slice headers; only the data
// pointers (offsets 0, 24, 48, ...) are used, slice lengths are not checked.
// start is a byte offset applied to every input and output.
// n is a byte count: n>>6 blocks of 64 bytes are processed and any remainder
// is ignored. The routine returns without touching memory when n>>6 is zero.
//
// Register roles:
//   AX               remaining 64-byte blocks
//   CX               matrix base
//   Z0-Z20           matrices 0-20, broadcast once before the loop
//   BX, SI, DI, R8, DX  input pointers 0-4 (already offset by start)
//   R9               base of the out slice headers
//   R10              running byte offset into every output (starts at start)
//   R11              scratch: current output data pointer
//   Z21-Z29          output accumulators, Z30 input block, Z31 scratch
TEXT ·mulGFNI_5x9_64(SB), $0-88
	// Loading 21 of 45 tables to registers
	// Destination pointers are not kept in registers; they are re-read from
	// the out slice headers on every iteration
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Input data pointers; DX is overwritten last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulGFNI_5x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// The first input writes the accumulators directly, so no clearing is needed.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// Matrices 21 and up are not resident; they are broadcast from memory.
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R9), R11
	VMOVDQU64 Z21, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU64 Z22, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU64 Z23, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU64 Z24, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU64 Z25, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU64 Z26, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU64 Z27, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU64 Z28, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU64 Z29, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x40, R10
	DECQ AX
	JNZ mulGFNI_5x9_64_loop
	VZEROUPPER

mulGFNI_5x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x9 applies GFNI affine matrices to 5 input slices and writes
// the XOR of the per-input products to 9 output slices, overwriting whatever
// the outputs held before. It uses 256-bit registers.
//
// matrix holds 45 64-bit matrices, indexed as matrix[input*9+output].
// in and out are read as arrays of 24-byte slice headers; only the data
// pointers (offsets 0, 24, 48, ...) are used, slice lengths are not checked.
// start is a byte offset applied to every input and output.
// n is a byte count: n>>5 blocks of 32 bytes are processed and any remainder
// is ignored. The routine returns without touching memory when n>>5 is zero.
//
// Register roles:
//   AX               remaining 32-byte blocks
//   CX               matrix base
//   Y0-Y4            matrices 0-4, broadcast once before the loop
//   BX, SI, DI, R8, DX  input pointers 0-4 (already offset by start)
//   R9               base of the out slice headers
//   R10              running byte offset into every output (starts at start)
//   R11              scratch: current output data pointer
//   Y5-Y13           output accumulators, Y14 input block
//   Y15              scratch: non-resident matrix, then the product
TEXT ·mulAvxGFNI_5x9(SB), $0-88
	// Loading 5 of 45 tables to registers
	// Destination pointers are not kept in registers; they are re-read from
	// the out slice headers on every iteration
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Input data pointers; DX is overwritten last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulAvxGFNI_5x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// The first input writes the accumulators directly; for outputs 5-8 the
	// matrix is broadcast into the accumulator and then replaced by the product.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R9), R11
	VMOVDQU Y5, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU Y6, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU Y7, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU Y8, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU Y9, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU Y10, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU Y11, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU Y12, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU Y13, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x20, R10
	DECQ AX
	JNZ mulAvxGFNI_5x9_loop
	VZEROUPPER

mulAvxGFNI_5x9_end:
	RET
|
|
|
|
// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x9_64Xor applies GFNI affine matrices to 5 input slices and XORs
// the products into the existing contents of 9 output slices.
//
// matrix holds 45 64-bit matrices, indexed as matrix[input*9+output].
// in and out are read as arrays of 24-byte slice headers; only the data
// pointers (offsets 0, 24, 48, ...) are used, slice lengths are not checked.
// start is a byte offset applied to every input and output.
// n is a byte count: n>>6 blocks of 64 bytes are processed and any remainder
// is ignored. The routine returns without touching memory when n>>6 is zero.
//
// Register roles:
//   AX               remaining 64-byte blocks
//   CX               matrix base
//   Z0-Z20           matrices 0-20, broadcast once before the loop
//   BX, SI, DI, R8, DX  input pointers 0-4 (already offset by start)
//   R9               base of the out slice headers
//   R10              running byte offset into every output (starts at start)
//   R11              scratch: current output data pointer
//   Z21-Z29          output accumulators, Z30 input block, Z31 scratch
TEXT ·mulGFNI_5x9_64Xor(SB), $0-88
	// Loading 21 of 45 tables to registers
	// Destination pointers are not kept in registers; they are re-read from
	// the out slice headers on every iteration
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x9_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Input data pointers; DX is overwritten last because it holds the header base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulGFNI_5x9_64Xor_loop:
	// Load 9 outputs
	MOVQ (R9), R11
	VMOVDQU64 (R11)(R10*1), Z21
	MOVQ 24(R9), R11
	VMOVDQU64 (R11)(R10*1), Z22
	MOVQ 48(R9), R11
	VMOVDQU64 (R11)(R10*1), Z23
	MOVQ 72(R9), R11
	VMOVDQU64 (R11)(R10*1), Z24
	MOVQ 96(R9), R11
	VMOVDQU64 (R11)(R10*1), Z25
	MOVQ 120(R9), R11
	VMOVDQU64 (R11)(R10*1), Z26
	MOVQ 144(R9), R11
	VMOVDQU64 (R11)(R10*1), Z27
	MOVQ 168(R9), R11
	VMOVDQU64 (R11)(R10*1), Z28
	MOVQ 192(R9), R11
	VMOVDQU64 (R11)(R10*1), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// Matrices 21 and up are not resident; they are broadcast from memory.
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R9), R11
	VMOVDQU64 Z21, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU64 Z22, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU64 Z23, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU64 Z24, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU64 Z25, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU64 Z26, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU64 Z27, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU64 Z28, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU64 Z29, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x40, R10
	DECQ AX
	JNZ mulGFNI_5x9_64Xor_loop
	VZEROUPPER

mulGFNI_5x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88
|
|
// Loading 5 of 45 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 56 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_5x9Xor_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
VBROADCASTSD 32(CX), Y4
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), DX
|
|
MOVQ out_base+48(FP), R9
|
|
MOVQ out_base+48(FP), R9
|
|
MOVQ start+72(FP), R10
|
|
|
|
// Add start offset to input
|
|
ADDQ R10, BX
|
|
ADDQ R10, SI
|
|
ADDQ R10, DI
|
|
ADDQ R10, R8
|
|
ADDQ R10, DX
|
|
|
|
mulAvxGFNI_5x9Xor_loop:
|
|
// Load 9 outputs
|
|
MOVQ (R9), R11
|
|
VMOVDQU (R11)(R10*1), Y5
|
|
MOVQ 24(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y6
|
|
MOVQ 48(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y7
|
|
MOVQ 72(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y8
|
|
MOVQ 96(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y9
|
|
MOVQ 120(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y10
|
|
MOVQ 144(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y11
|
|
MOVQ 168(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y12
|
|
MOVQ 192(R9), R11
|
|
VMOVDQU (R11)(R10*1), Y13
|
|
|
|
// Load and process 32 bytes from input 0 to 9 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 40(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 48(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 56(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 64(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 9 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 9 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 9 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 9 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 9 outputs
|
|
MOVQ (R9), R11
|
|
VMOVDQU Y5, (R11)(R10*1)
|
|
MOVQ 24(R9), R11
|
|
VMOVDQU Y6, (R11)(R10*1)
|
|
MOVQ 48(R9), R11
|
|
VMOVDQU Y7, (R11)(R10*1)
|
|
MOVQ 72(R9), R11
|
|
VMOVDQU Y8, (R11)(R10*1)
|
|
MOVQ 96(R9), R11
|
|
VMOVDQU Y9, (R11)(R10*1)
|
|
MOVQ 120(R9), R11
|
|
VMOVDQU Y10, (R11)(R10*1)
|
|
MOVQ 144(R9), R11
|
|
VMOVDQU Y11, (R11)(R10*1)
|
|
MOVQ 168(R9), R11
|
|
VMOVDQU Y12, (R11)(R10*1)
|
|
MOVQ 192(R9), R11
|
|
VMOVDQU Y13, (R11)(R10*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R10
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_5x9Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_5x9Xor_end:
|
|
RET
|
|
|
|
// mulGFNI_5x10_64 computes 10 outputs from 5 inputs and overwrites out.
// Every 64-byte input vector is run through VGF2P8AFFINEQB (constant 0) with
// the 64-bit table matrix[input*10+output]; the five per-input results for
// each output are XORed together and stored at out[output][start+offset].
// Processes 64 bytes/loop: n is divided by 64 and any remainder is not
// touched. Slice lengths are never read, so no bounds checks happen here.

// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x10_64(SB), $0-88
	// Loading 20 of 50 tables to registers (inputs 0 and 1 -> Z0-Z19);
	// tables for inputs 2-4 are broadcast from memory inside the loop.
	// Destination pointers are re-read from the out slice headers each loop.
	// Full registers estimated 62 YMM used
	//
	// Register roles:
	//   AX          remaining 64-byte iterations
	//   CX          matrix base
	//   BX,SI,DI,R8,DX  read cursors for inputs 0-4
	//   R9          base of the out slice-header array (24 bytes per header)
	//   R10         byte offset applied to every output (starts at start)
	//   R11         scratch: current output base pointer
	//   Z20-Z29     accumulators for outputs 0-9, Z30 input data, Z31 temp
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF when the iteration count is zero, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x10_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the five input base pointers; DX is overwritten last because it
	// also holds the slice-header array address until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulGFNI_5x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	// (tables come straight from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	// (each output base pointer is fetched from its slice header, then
	// indexed by the running offset in R10)
	MOVQ (R9), R11
	VMOVDQU64 Z20, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU64 Z21, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU64 Z22, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU64 Z23, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU64 Z24, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU64 Z25, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU64 Z26, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU64 Z27, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU64 Z28, (R11)(R10*1)
	MOVQ 216(R9), R11
	VMOVDQU64 Z29, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x40, R10
	DECQ AX
	JNZ  mulGFNI_5x10_64_loop
	VZEROUPPER

mulGFNI_5x10_64_end:
	RET
|
|
|
|
// mulAvxGFNI_5x10 computes 10 outputs from 5 inputs and overwrites out.
// Every 32-byte input vector is run through VGF2P8AFFINEQB (constant 0) with
// the 64-bit table matrix[input*10+output]; the five per-input results for
// each output are XORed together and stored at out[output][start+offset].
// Processes 32 bytes/loop: n is divided by 32 and any remainder is not
// touched. Slice lengths are never read, so no bounds checks happen here.

// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x10(SB), $0-88
	// Loading 4 of 50 tables to registers (Y0-Y3); the remaining tables are
	// broadcast from memory inside the loop.
	// Destination pointers are re-read from the out slice headers each loop.
	// Full registers estimated 62 YMM used
	//
	// Register roles:
	//   AX          remaining 32-byte iterations
	//   CX          matrix base
	//   BX,SI,DI,R8,DX  read cursors for inputs 0-4
	//   R9          base of the out slice-header array (24 bytes per header)
	//   R10         byte offset applied to every output (starts at start)
	//   R11         scratch: current output base pointer
	//   Y4-Y13      accumulators for outputs 0-9, Y14 input data, Y15 temp
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF when the iteration count is zero, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x10_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the five input base pointers; DX is overwritten last because it
	// also holds the slice-header array address until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulAvxGFNI_5x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	// (first input initialises the accumulators, so no XOR is needed; for
	// outputs 4-9 the accumulator register doubles as the table register)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	// (each output base pointer is fetched from its slice header, then
	// indexed by the running offset in R10)
	MOVQ (R9), R11
	VMOVDQU Y4, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU Y5, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU Y6, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU Y7, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU Y8, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU Y9, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU Y10, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU Y11, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU Y12, (R11)(R10*1)
	MOVQ 216(R9), R11
	VMOVDQU Y13, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x20, R10
	DECQ AX
	JNZ  mulAvxGFNI_5x10_loop
	VZEROUPPER

mulAvxGFNI_5x10_end:
	RET
|
|
|
|
// mulGFNI_5x10_64Xor computes 10 outputs from 5 inputs and XORs them into
// the existing contents of out. Every 64-byte input vector is run through
// VGF2P8AFFINEQB (constant 0) with the 64-bit table matrix[input*10+output];
// the results are XORed onto the bytes already at out[output][start+offset]
// and written back. Processes 64 bytes/loop: n is divided by 64 and any
// remainder is not touched. Slice lengths are never read, so no bounds
// checks happen here.

// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x10_64Xor(SB), $0-88
	// Loading 20 of 50 tables to registers (inputs 0 and 1 -> Z0-Z19);
	// tables for inputs 2-4 are broadcast from memory inside the loop.
	// Destination pointers are re-read from the out slice headers each loop.
	// Full registers estimated 62 YMM used
	//
	// Register roles:
	//   AX          remaining 64-byte iterations
	//   CX          matrix base
	//   BX,SI,DI,R8,DX  read cursors for inputs 0-4
	//   R9          base of the out slice-header array (24 bytes per header)
	//   R10         byte offset applied to every output (starts at start)
	//   R11         scratch: current output base pointer
	//   Z20-Z29     accumulators for outputs 0-9, Z30 input data, Z31 temp
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF when the iteration count is zero, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_5x10_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the five input base pointers; DX is overwritten last because it
	// also holds the slice-header array address until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulGFNI_5x10_64Xor_loop:
	// Load 10 outputs
	// (the current output bytes seed the accumulators)
	MOVQ (R9), R11
	VMOVDQU64 (R11)(R10*1), Z20
	MOVQ 24(R9), R11
	VMOVDQU64 (R11)(R10*1), Z21
	MOVQ 48(R9), R11
	VMOVDQU64 (R11)(R10*1), Z22
	MOVQ 72(R9), R11
	VMOVDQU64 (R11)(R10*1), Z23
	MOVQ 96(R9), R11
	VMOVDQU64 (R11)(R10*1), Z24
	MOVQ 120(R9), R11
	VMOVDQU64 (R11)(R10*1), Z25
	MOVQ 144(R9), R11
	VMOVDQU64 (R11)(R10*1), Z26
	MOVQ 168(R9), R11
	VMOVDQU64 (R11)(R10*1), Z27
	MOVQ 192(R9), R11
	VMOVDQU64 (R11)(R10*1), Z28
	MOVQ 216(R9), R11
	VMOVDQU64 (R11)(R10*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	// (tables come straight from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R9), R11
	VMOVDQU64 Z20, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU64 Z21, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU64 Z22, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU64 Z23, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU64 Z24, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU64 Z25, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU64 Z26, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU64 Z27, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU64 Z28, (R11)(R10*1)
	MOVQ 216(R9), R11
	VMOVDQU64 Z29, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x40, R10
	DECQ AX
	JNZ  mulGFNI_5x10_64Xor_loop
	VZEROUPPER

mulGFNI_5x10_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_5x10Xor computes 10 outputs from 5 inputs and XORs them into
// the existing contents of out. Every 32-byte input vector is run through
// VGF2P8AFFINEQB (constant 0) with the 64-bit table matrix[input*10+output];
// the results are XORed onto the bytes already at out[output][start+offset]
// and written back. Processes 32 bytes/loop: n is divided by 32 and any
// remainder is not touched. Slice lengths are never read, so no bounds
// checks happen here.

// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88
	// Loading 4 of 50 tables to registers (Y0-Y3); the remaining tables are
	// broadcast from memory inside the loop.
	// Destination pointers are re-read from the out slice headers each loop.
	// Full registers estimated 62 YMM used
	//
	// Register roles:
	//   AX          remaining 32-byte iterations
	//   CX          matrix base
	//   BX,SI,DI,R8,DX  read cursors for inputs 0-4
	//   R9          base of the out slice-header array (24 bytes per header)
	//   R10         byte offset applied to every output (starts at start)
	//   R11         scratch: current output base pointer
	//   Y4-Y13      accumulators for outputs 0-9, Y14 input data, Y15 temp
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF when the iteration count is zero, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_5x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the five input base pointers; DX is overwritten last because it
	// also holds the slice-header array address until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulAvxGFNI_5x10Xor_loop:
	// Load 10 outputs
	// (the current output bytes seed the accumulators)
	MOVQ (R9), R11
	VMOVDQU (R11)(R10*1), Y4
	MOVQ 24(R9), R11
	VMOVDQU (R11)(R10*1), Y5
	MOVQ 48(R9), R11
	VMOVDQU (R11)(R10*1), Y6
	MOVQ 72(R9), R11
	VMOVDQU (R11)(R10*1), Y7
	MOVQ 96(R9), R11
	VMOVDQU (R11)(R10*1), Y8
	MOVQ 120(R9), R11
	VMOVDQU (R11)(R10*1), Y9
	MOVQ 144(R9), R11
	VMOVDQU (R11)(R10*1), Y10
	MOVQ 168(R9), R11
	VMOVDQU (R11)(R10*1), Y11
	MOVQ 192(R9), R11
	VMOVDQU (R11)(R10*1), Y12
	MOVQ 216(R9), R11
	VMOVDQU (R11)(R10*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R9), R11
	VMOVDQU Y4, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU Y5, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU Y6, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU Y7, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU Y8, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU Y9, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU Y10, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU Y11, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU Y12, (R11)(R10*1)
	MOVQ 216(R9), R11
	VMOVDQU Y13, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x20, R10
	DECQ AX
	JNZ  mulAvxGFNI_5x10Xor_loop
	VZEROUPPER

mulAvxGFNI_5x10Xor_end:
	RET
|
|
|
|
// mulGFNI_6x1_64 computes 1 output from 6 inputs and overwrites out[0].
// Every 64-byte input vector is run through VGF2P8AFFINEQB (constant 0) with
// the 64-bit table matrix[input]; the six results are XORed together and
// stored to out[0] starting at byte offset start.
// Processes 64 bytes/loop: n is divided by 64 and any remainder is not
// touched. Slice lengths are never read, so no bounds checks happen here.

// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_6x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 9 YMM used
	//
	// Register roles:
	//   AX          remaining 64-byte iterations
	//   CX          matrix base during setup, then read cursor for input 5
	//   DX,BX,SI,DI,R8  read cursors for inputs 0-4
	//   R9          write cursor for output 0
	//   R10         start offset (only used during setup)
	//   Z0-Z5       tables for inputs 0-5, Z6 accumulator, Z7 input data/temp
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF when the iteration count is zero, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x1_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// The matrix pointer is no longer needed, so CX is reused for the input
	// slice-header array and finally for the input 5 base pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R9
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, CX

mulGFNI_6x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 output
	// (first input initialises the accumulator, so no XOR is needed)
	VMOVDQU64 (DX), Z7
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z7, Z6

	// Load and process 64 bytes from input 1 to 1 output
	VMOVDQU64 (BX), Z7
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Load and process 64 bytes from input 2 to 1 output
	VMOVDQU64 (SI), Z7
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Load and process 64 bytes from input 3 to 1 output
	VMOVDQU64 (DI), Z7
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Load and process 64 bytes from input 4 to 1 output
	VMOVDQU64 (R8), Z7
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Load and process 64 bytes from input 5 to 1 output
	VMOVDQU64 (CX), Z7
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Store 1 output
	VMOVDQU64 Z6, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x1_64_loop
	VZEROUPPER

mulGFNI_6x1_64_end:
	RET
|
|
|
|
// mulAvxGFNI_6x1 computes 1 output from 6 inputs and overwrites out[0].
// Every 32-byte input vector is run through VGF2P8AFFINEQB (constant 0) with
// the 64-bit table matrix[input]; the six results are XORed together and
// stored to out[0] starting at byte offset start.
// Processes 32 bytes/loop: n is divided by 32 and any remainder is not
// touched. Slice lengths are never read, so no bounds checks happen here.

// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_6x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 9 YMM used
	//
	// Register roles:
	//   AX          remaining 32-byte iterations
	//   CX          matrix base during setup, then read cursor for input 5
	//   DX,BX,SI,DI,R8  read cursors for inputs 0-4
	//   R9          write cursor for output 0
	//   R10         start offset (only used during setup)
	//   Y0-Y5       tables for inputs 0-5, Y6 accumulator, Y7 input data/temp
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF when the iteration count is zero, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x1_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// The matrix pointer is no longer needed, so CX is reused for the input
	// slice-header array and finally for the input 5 base pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R9
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, CX

mulAvxGFNI_6x1_loop:
	// Load and process 32 bytes from input 0 to 1 output
	// (first input initialises the accumulator, so no XOR is needed)
	VMOVDQU (DX), Y7
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y7, Y6

	// Load and process 32 bytes from input 1 to 1 output
	VMOVDQU (BX), Y7
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Load and process 32 bytes from input 2 to 1 output
	VMOVDQU (SI), Y7
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Load and process 32 bytes from input 3 to 1 output
	VMOVDQU (DI), Y7
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Load and process 32 bytes from input 4 to 1 output
	VMOVDQU (R8), Y7
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Load and process 32 bytes from input 5 to 1 output
	VMOVDQU (CX), Y7
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Store 1 output
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x1_loop
	VZEROUPPER

mulAvxGFNI_6x1_end:
	RET
|
|
|
|
// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 6 input slices into 1 output slice, 64 bytes per iteration,
// XORing into the bytes already present in the output:
//   out[0] ^= XOR over k = 0..5 of AFFINE(matrix[k], in[k])
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  6 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointer of out[0] is read; the slice is read and rewritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 whole 64-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>6 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5], R9 = out[0],
// Z0-Z5 = matrix entries, Z6 = accumulator, Z7 = scratch.
TEXT ·mulGFNI_6x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x06, AX
	JZ mulGFNI_6x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5

	// Slice headers are 24 bytes apart; CX is overwritten last because it
	// doubles as the base pointer of the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R9
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, CX

mulGFNI_6x1_64Xor_loop:
	// Load 1 outputs: the accumulator starts from the existing output bytes
	VMOVDQU64 (R9), Z6

	// Input 0
	VMOVDQU64 (DX), Z7
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Input 1
	VMOVDQU64 (BX), Z7
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Input 2
	VMOVDQU64 (SI), Z7
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Input 3
	VMOVDQU64 (DI), Z7
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Input 4
	VMOVDQU64 (R8), Z7
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Input 5
	VMOVDQU64 (CX), Z7
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
	VXORPD Z6, Z7, Z6

	// Store 1 outputs
	VMOVDQU64 Z6, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x1_64Xor_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulGFNI_6x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 6 input slices into 1 output slice, 32 bytes per iteration,
// XORing into the bytes already present in the output:
//   out[0] ^= XOR over k = 0..5 of AFFINE(matrix[k], in[k])
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  6 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointer of out[0] is read; the slice is read and rewritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 whole 32-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>5 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5], R9 = out[0],
// Y0-Y5 = matrix entries, Y6 = accumulator, Y7 = scratch.
TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_6x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Slice headers are 24 bytes apart; CX is overwritten last because it
	// doubles as the base pointer of the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R9
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, CX

mulAvxGFNI_6x1Xor_loop:
	// Load 1 outputs: the accumulator starts from the existing output bytes
	VMOVDQU (R9), Y6

	// Input 0
	VMOVDQU (DX), Y7
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Input 1
	VMOVDQU (BX), Y7
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Input 2
	VMOVDQU (SI), Y7
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Input 3
	VMOVDQU (DI), Y7
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Input 4
	VMOVDQU (R8), Y7
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Input 5
	VMOVDQU (CX), Y7
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
	VXORPD Y6, Y7, Y6

	// Store 1 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x1Xor_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulAvxGFNI_6x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 6 input slices into 2 output slices, 64 bytes per iteration:
//   out[j] = XOR over k = 0..5 of AFFINE(matrix[2*k+j], in[k]),  j = 0..1
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  12 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..1] are read; previous contents are overwritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 whole 64-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>6 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5],
// R10/R9 = out[0]/out[1], Z0-Z11 = matrix entries,
// Z12/Z13 = accumulators, Z14 = current input block, Z15 = scratch.
TEXT ·mulGFNI_6x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x06, AX
	JZ mulGFNI_6x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Slice headers are 24 bytes apart; the base register of each header
	// array (CX, R9) is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulGFNI_6x2_64_loop:
	// Input 0 initialises both accumulators (no XOR needed)
	VMOVDQU64 (DX), Z14
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z14, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z14, Z13

	// Input 1
	VMOVDQU64 (BX), Z14
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 2
	VMOVDQU64 (SI), Z14
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 3
	VMOVDQU64 (DI), Z14
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 4
	VMOVDQU64 (R8), Z14
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 5
	VMOVDQU64 (CX), Z14
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Store 2 outputs
	VMOVDQU64 Z12, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z13, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x2_64_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulGFNI_6x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 6 input slices into 2 output slices, 32 bytes per iteration:
//   out[j] = XOR over k = 0..5 of AFFINE(matrix[2*k+j], in[k]),  j = 0..1
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  12 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..1] are read; previous contents are overwritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 whole 32-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>5 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5],
// R10/R9 = out[0]/out[1], Y0-Y11 = matrix entries,
// Y12/Y13 = accumulators, Y14 = current input block, Y15 = scratch.
TEXT ·mulAvxGFNI_6x2(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_6x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Slice headers are 24 bytes apart; the base register of each header
	// array (CX, R9) is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulAvxGFNI_6x2_loop:
	// Input 0 initialises both accumulators (no XOR needed)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Input 1
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 2
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 3
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 4
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 5
	VMOVDQU (CX), Y14
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x2_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulAvxGFNI_6x2_end:
	RET
|
|
|
|
// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 6 input slices into 2 output slices, 64 bytes per iteration,
// XORing into the bytes already present in each output:
//   out[j] ^= XOR over k = 0..5 of AFFINE(matrix[2*k+j], in[k]),  j = 0..1
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  12 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..1] are read; the slices are read and rewritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 whole 64-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>6 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5],
// R10/R9 = out[0]/out[1], Z0-Z11 = matrix entries,
// Z12/Z13 = accumulators, Z14 = current input block, Z15 = scratch.
TEXT ·mulGFNI_6x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x06, AX
	JZ mulGFNI_6x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Slice headers are 24 bytes apart; the base register of each header
	// array (CX, R9) is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulGFNI_6x2_64Xor_loop:
	// Load 2 outputs: the accumulators start from the existing output bytes
	VMOVDQU64 (R10), Z12
	VMOVDQU64 (R9), Z13

	// Input 0
	VMOVDQU64 (DX), Z14
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 1
	VMOVDQU64 (BX), Z14
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 2
	VMOVDQU64 (SI), Z14
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 3
	VMOVDQU64 (DI), Z14
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 4
	VMOVDQU64 (R8), Z14
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Input 5
	VMOVDQU64 (CX), Z14
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Store 2 outputs
	VMOVDQU64 Z12, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z13, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x2_64Xor_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulGFNI_6x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 6 input slices into 2 output slices, 32 bytes per iteration,
// XORing into the bytes already present in each output:
//   out[j] ^= XOR over k = 0..5 of AFFINE(matrix[2*k+j], in[k]),  j = 0..1
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  12 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..1] are read; the slices are read and rewritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 whole 32-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>5 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5],
// R10/R9 = out[0]/out[1], Y0-Y11 = matrix entries,
// Y12/Y13 = accumulators, Y14 = current input block, Y15 = scratch.
TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_6x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Slice headers are 24 bytes apart; the base register of each header
	// array (CX, R9) is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulAvxGFNI_6x2Xor_loop:
	// Load 2 outputs: the accumulators start from the existing output bytes
	VMOVDQU (R10), Y12
	VMOVDQU (R9), Y13

	// Input 0
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 1
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 2
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 3
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 4
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 5
	VMOVDQU (CX), Y14
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x2Xor_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulAvxGFNI_6x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 6 input slices into 3 output slices, 64 bytes per iteration:
//   out[j] = XOR over k = 0..5 of AFFINE(matrix[3*k+j], in[k]),  j = 0..2
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  18 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..2] are read; previous contents are overwritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 whole 64-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>6 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5],
// R10/R11/R9 = out[0]/out[1]/out[2], Z0-Z17 = matrix entries,
// Z18-Z20 = accumulators, Z21 = current input block, Z22 = scratch.
TEXT ·mulGFNI_6x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x06, AX
	JZ mulGFNI_6x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Slice headers are 24 bytes apart; the base register of each header
	// array (CX, R9) is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, CX

mulGFNI_6x3_64_loop:
	// Input 0 initialises all three accumulators (no XOR needed)
	VMOVDQU64 (DX), Z21
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z21, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z21, Z20

	// Input 1
	VMOVDQU64 (BX), Z21
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 2
	VMOVDQU64 (SI), Z21
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 3
	VMOVDQU64 (DI), Z21
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 4
	VMOVDQU64 (R8), Z21
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 5
	VMOVDQU64 (CX), Z21
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Store 3 outputs
	VMOVDQU64 Z18, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z19, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x3_64_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulGFNI_6x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 6 input slices into 3 output slices, 32 bytes per iteration:
//   out[j] = XOR over k = 0..5 of AFFINE(matrix[3*k+j], in[k]),  j = 0..2
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  18 uint64 entries are read. Entries 0..10 are broadcast into
//           Y0-Y10 once; entries 11..17 are re-broadcast from memory on
//           every iteration, so CX keeps pointing at the matrix throughout.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..2] are read; previous contents are overwritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 whole 32-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>5 == 0.
//
// Registers: AX = blocks left, CX = matrix, BX/SI/DI/R8/R9/DX = in[0..5],
// R11/R12/R10 = out[0]/out[1]/out[2], Y11-Y13 = accumulators,
// Y14 = current input block, Y15 = scratch / reloaded matrix entry.
TEXT ·mulAvxGFNI_6x3(SB), $0-88
	// Loading 11 of 18 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_6x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Slice headers are 24 bytes apart; the base register of each header
	// array (DX, R10) is overwritten last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, DX

mulAvxGFNI_6x3_loop:
	// Input 0 initialises all three accumulators (no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Input 1
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 2
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 3 (matrix entry 11 onward is reloaded from memory)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 4
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 5
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x3_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulAvxGFNI_6x3_end:
	RET
|
|
|
|
// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 6 input slices into 3 output slices, 64 bytes per iteration,
// XORing into the bytes already present in each output:
//   out[j] ^= XOR over k = 0..5 of AFFINE(matrix[3*k+j], in[k]),  j = 0..2
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  18 uint64 entries are read, each broadcast to every 64-bit lane.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..2] are read; the slices are read and rewritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 whole 64-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>6 == 0.
//
// Registers: AX = blocks left, DX/BX/SI/DI/R8/CX = in[0..5],
// R10/R11/R9 = out[0]/out[1]/out[2], Z0-Z17 = matrix entries,
// Z18-Z20 = accumulators, Z21 = current input block, Z22 = scratch.
TEXT ·mulGFNI_6x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x06, AX
	JZ mulGFNI_6x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Slice headers are 24 bytes apart; the base register of each header
	// array (CX, R9) is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, CX

mulGFNI_6x3_64Xor_loop:
	// Load 3 outputs: the accumulators start from the existing output bytes
	VMOVDQU64 (R10), Z18
	VMOVDQU64 (R11), Z19
	VMOVDQU64 (R9), Z20

	// Input 0
	VMOVDQU64 (DX), Z21
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 1
	VMOVDQU64 (BX), Z21
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 2
	VMOVDQU64 (SI), Z21
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 3
	VMOVDQU64 (DI), Z21
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 4
	VMOVDQU64 (R8), Z21
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Input 5
	VMOVDQU64 (CX), Z21
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
	VXORPD Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
	VXORPD Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
	VXORPD Z20, Z22, Z20

	// Store 3 outputs
	VMOVDQU64 Z18, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z19, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x3_64Xor_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulGFNI_6x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 6 input slices into 3 output slices, 32 bytes per iteration,
// XORing into the bytes already present in each output:
//   out[j] ^= XOR over k = 0..5 of AFFINE(matrix[3*k+j], in[k]),  j = 0..2
// where AFFINE is VGF2P8AFFINEQB with a zero constant term ($0x00).
//
//   matrix  18 uint64 entries are read. Entries 0..10 are broadcast into
//           Y0-Y10 once; entries 11..17 are re-broadcast from memory on
//           every iteration, so CX keeps pointing at the matrix throughout.
//   in      data pointers of in[0..5] are read; slice lengths are not checked.
//   out     data pointers of out[0..2] are read; the slices are read and rewritten.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 whole 32-byte blocks are processed and any
//           remainder is left untouched. Returns at once when n>>5 == 0.
//
// Registers: AX = blocks left, CX = matrix, BX/SI/DI/R8/R9/DX = in[0..5],
// R11/R12/R10 = out[0]/out[1]/out[2], Y11-Y13 = accumulators,
// Y14 = current input block, Y15 = scratch / reloaded matrix entry.
TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88
	// Loading 11 of 18 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ tests the block count directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_6x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Slice headers are 24 bytes apart; the base register of each header
	// array (DX, R10) is overwritten last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, DX

mulAvxGFNI_6x3Xor_loop:
	// Load 3 outputs: the accumulators start from the existing output bytes
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (R10), Y13

	// Input 0
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 1
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 2
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 3 (matrix entry 11 onward is reloaded from memory)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 4
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 5
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x3Xor_loop

	// Only reached after the loop; the early-exit path touches no vector registers.
	VZEROUPPER

mulAvxGFNI_6x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 4 outputs from 6 inputs, 64 bytes per iteration:
//   out[o][start+k] = XOR over i=0..5 of affine(matrix[i*4+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand. Previous contents of out are overwritten.
// Runs n>>6 iterations; a trailing n%64 bytes are not processed and nothing is
// written when n < 64. Only the data pointers of in[0..5] and out[0..3] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   Z0-Z23                  broadcast matrix entries 0..23
//   Z24-Z27                 accumulators for out[0..3]
//   Z28 / Z29               current input block / scratch product
//   DX, BX, SI, DI, R8, CX  cursors into in[0..5] (CX is the matrix base until the tables are loaded)
//   R10, R11, R12, R9       cursors into out[0..3]
TEXT ·mulGFNI_6x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 ZMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, CX

mulGFNI_6x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z28
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z28, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z28, Z27

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z28
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z28
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z28
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R8), Z28
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (CX), Z28
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Store 4 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x4_64_loop
	VZEROUPPER

mulGFNI_6x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 4 outputs from 6 inputs, 32 bytes per iteration:
//   out[o][start+k] = XOR over i=0..5 of affine(matrix[i*4+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand. Previous contents of out are overwritten.
// Runs n>>5 iterations; a trailing n%32 bytes are not processed and nothing is
// written when n < 32. Only the data pointers of in[0..5] and out[0..3] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base (entries 10..23 are re-broadcast each iteration)
//   Y0-Y9                   broadcast matrix entries 0..9
//   Y10-Y13                 accumulators for out[0..3]
//   Y14 / Y15               current input block / scratch table and product
//   BX, SI, DI, R8, R9, DX  cursors into in[0..5]
//   R11, R12, R13, R10      cursors into out[0..3]
TEXT ·mulAvxGFNI_6x4(SB), $0-88
	// Loading 10 of 24 tables to registers
	// Destination kept in GP registers
	// Keeping every table resident would need an estimated 30 YMM registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R10
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R10

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, DX

mulAvxGFNI_6x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from here on the tables no longer fit in registers and are broadcast into Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x4_loop
	VZEROUPPER

mulAvxGFNI_6x4_end:
	RET
|
|
|
|
// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: the existing output bytes are loaded first and the
// products are XORed into them, 64 bytes per iteration:
//   out[o][start+k] ^= XOR over i=0..5 of affine(matrix[i*4+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand.
// Runs n>>6 iterations; a trailing n%64 bytes are not processed and nothing is
// written when n < 64. Only the data pointers of in[0..5] and out[0..3] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   Z0-Z23                  broadcast matrix entries 0..23
//   Z24-Z27                 accumulators for out[0..3]
//   Z28 / Z29               current input block / scratch product
//   DX, BX, SI, DI, R8, CX  cursors into in[0..5] (CX is the matrix base until the tables are loaded)
//   R10, R11, R12, R9       cursors into out[0..3]
TEXT ·mulGFNI_6x4_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 ZMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, CX

mulGFNI_6x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R9), Z27

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (DX), Z28
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z28
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z28
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z28
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R8), Z28
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (CX), Z28
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
	VXORPD Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
	VXORPD Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
	VXORPD Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
	VXORPD Z27, Z29, Z27

	// Store 4 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x4_64Xor_loop
	VZEROUPPER

mulGFNI_6x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: the existing output bytes are loaded first and the
// products are XORed into them, 32 bytes per iteration:
//   out[o][start+k] ^= XOR over i=0..5 of affine(matrix[i*4+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand.
// Runs n>>5 iterations; a trailing n%32 bytes are not processed and nothing is
// written when n < 32. Only the data pointers of in[0..5] and out[0..3] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base (entries 10..23 are re-broadcast each iteration)
//   Y0-Y9                   broadcast matrix entries 0..9
//   Y10-Y13                 accumulators for out[0..3]
//   Y14 / Y15               current input block / scratch table and product
//   BX, SI, DI, R8, R9, DX  cursors into in[0..5]
//   R11, R12, R13, R10      cursors into out[0..3]
TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88
	// Loading 10 of 24 tables to registers
	// Destination kept in GP registers
	// Keeping every table resident would need an estimated 30 YMM registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R10
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R10

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, DX

mulAvxGFNI_6x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R10), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from here on the tables no longer fit in registers and are broadcast into Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x4Xor_loop
	VZEROUPPER

mulAvxGFNI_6x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 5 outputs from 6 inputs, 64 bytes per iteration:
//   out[o][start+k] = XOR over i=0..5 of affine(matrix[i*5+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand. Previous contents of out are overwritten.
// Runs n>>6 iterations; a trailing n%64 bytes are not processed and nothing is
// written when n < 64. Only the data pointers of in[0..5] and out[0..4] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                       remaining 64-byte blocks
//   CX                       matrix base (entries 25..29 are used as broadcast memory operands)
//   Z0-Z24                   broadcast matrix entries 0..24
//   Z25-Z29                  accumulators for out[0..4]
//   Z30 / Z31                current input block / scratch product
//   BX, SI, DI, R8, R9, DX   cursors into in[0..5]
//   R11, R12, R13, R14, R10  cursors into out[0..4]
TEXT ·mulGFNI_6x5_64(SB), $0-88
	// Loading 25 of 30 tables to registers
	// Destination kept in GP registers
	// Keeping every table resident would need an estimated 37 ZMM registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x5_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R10
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R10

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, DX

mulGFNI_6x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (tables 25..29 are not resident; they are broadcast straight from memory)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x5_64_loop
	VZEROUPPER

mulGFNI_6x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 5 outputs from 6 inputs, 32 bytes per iteration:
//   out[o][start+k] = XOR over i=0..5 of affine(matrix[i*5+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand. Previous contents of out are overwritten.
// Runs n>>5 iterations; a trailing n%32 bytes are not processed and nothing is
// written when n < 32. Only the data pointers of in[0..5] and out[0..4] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                       remaining 32-byte blocks
//   CX                       matrix base (entries 9..29 are re-broadcast each iteration)
//   Y0-Y8                    broadcast matrix entries 0..8
//   Y9-Y13                   accumulators for out[0..4]
//   Y14 / Y15                current input block / scratch table and product
//   BX, SI, DI, R8, R9, DX   cursors into in[0..5]
//   R11, R12, R13, R14, R10  cursors into out[0..4]
TEXT ·mulAvxGFNI_6x5(SB), $0-88
	// Loading 9 of 30 tables to registers
	// Destination kept in GP registers
	// Keeping every table resident would need an estimated 37 YMM registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x5_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R10
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R10

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, DX

mulAvxGFNI_6x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	// (from the last table here on, tables are broadcast into Y15 from memory)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x5_loop
	VZEROUPPER

mulAvxGFNI_6x5_end:
	RET
|
|
|
|
// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: the existing output bytes are loaded first and the
// products are XORed into them, 64 bytes per iteration:
//   out[o][start+k] ^= XOR over i=0..5 of affine(matrix[i*5+o], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry as its 8x8 bit-matrix operand.
// Runs n>>6 iterations; a trailing n%64 bytes are not processed and nothing is
// written when n < 64. Only the data pointers of in[0..5] and out[0..4] are
// read; slice lengths are not checked here.
//
// Register roles:
//   AX                       remaining 64-byte blocks
//   CX                       matrix base (entries 25..29 are used as broadcast memory operands)
//   Z0-Z24                   broadcast matrix entries 0..24
//   Z25-Z29                  accumulators for out[0..4]
//   Z30 / Z31                current input block / scratch product
//   BX, SI, DI, R8, R9, DX   cursors into in[0..5]
//   R11, R12, R13, R14, R10  cursors into out[0..4]
TEXT ·mulGFNI_6x5_64Xor(SB), $0-88
	// Loading 25 of 30 tables to registers
	// Destination kept in GP registers
	// Keeping every table resident would need an estimated 37 ZMM registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x5_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Slice headers are 24 bytes apart; load the data pointer of each input.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointer of each output.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R10
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R10

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, DX

mulGFNI_6x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R10), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (tables 25..29 are not resident; they are broadcast straight from memory)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x5_64Xor_loop
	VZEROUPPER

mulGFNI_6x5_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_6x5Xor combines 6 input slices into 5 output slices using
// 256-bit GFNI affine transforms, XORing the result into the bytes already
// present in the outputs.
//
// For input i and output j the 64-bit table at matrix[i*5+j] is broadcast to
// a YMM register and applied to 32 input bytes with VGF2P8AFFINEQB; the
// result is XORed into the accumulator for output j.
//
// All input and output pointers are advanced by start before the loop.
// Only n>>5 whole 32-byte blocks are processed; if that count is zero the
// function returns without touching memory.

// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88
	// Loading 9 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_6x5Xor_end

	// Tables 0-8 stay resident in Y0-Y8; the rest are re-broadcast per use.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0..5] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointers of out[0..4]; R10 is reused for the last one.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R10
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R10

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, DX

mulAvxGFNI_6x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R10), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x5Xor_loop
	VZEROUPPER

mulAvxGFNI_6x5Xor_end:
	RET
|
|
|
|
// mulGFNI_6x6_64 combines 6 input slices into 6 output slices using 512-bit
// GFNI affine transforms, overwriting the outputs.
//
// For input i and output j the 64-bit table at matrix[i*6+j] is applied to
// 64 input bytes with VGF2P8AFFINEQB. Input 0 initialises the six
// accumulators Z24-Z29; inputs 1-5 are XORed into them.
//
// All input and output pointers are advanced by start before the loop.
// Only n>>6 whole 64-byte blocks are processed; if that count is zero the
// function returns without touching memory.
//
// NOTE(review): BP is used as a scratch register; the 8-byte frame presumably
// exists so the assembler saves and restores it - confirm.

// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_6x6_64(SB), $8-88
	// Loading 24 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_6x6_64_end

	// Tables for inputs 0-3 stay resident in Z0-Z23; inputs 4 and 5 use
	// embedded-broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Data pointers of in[0..5] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointers of out[0..5]; R10 is reused for the last one.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, DX

mulGFNI_6x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x6_64_loop
	VZEROUPPER

mulGFNI_6x6_64_end:
	RET
|
|
|
|
// mulAvxGFNI_6x6 combines 6 input slices into 6 output slices using 256-bit
// GFNI affine transforms, overwriting the outputs.
//
// For input i and output j the 64-bit table at matrix[i*6+j] is broadcast to
// a YMM register and applied to 32 input bytes with VGF2P8AFFINEQB. Input 0
// initialises the six accumulators Y8-Y13; inputs 1-5 are XORed into them.
//
// All input and output pointers are advanced by start before the loop.
// Only n>>5 whole 32-byte blocks are processed; if that count is zero the
// function returns without touching memory.
//
// NOTE(review): BP is used as a scratch register; the 8-byte frame presumably
// exists so the assembler saves and restores it - confirm.

// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_6x6(SB), $8-88
	// Loading 8 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_6x6_end

	// Tables 0-7 stay resident in Y0-Y7; the rest are re-broadcast per use.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of in[0..5] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointers of out[0..5]; R10 is reused for the last one.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, DX

mulAvxGFNI_6x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x6_loop
	VZEROUPPER

mulAvxGFNI_6x6_end:
	RET
|
|
|
|
// mulGFNI_6x6_64Xor combines 6 input slices into 6 output slices using
// 512-bit GFNI affine transforms, XORing the result into the bytes already
// present in the outputs.
//
// For input i and output j the 64-bit table at matrix[i*6+j] is applied to
// 64 input bytes with VGF2P8AFFINEQB; the result is XORed into the
// accumulator for output j, which starts as the current output contents.
//
// All input and output pointers are advanced by start before the loop.
// Only n>>6 whole 64-byte blocks are processed; if that count is zero the
// function returns without touching memory.
//
// NOTE(review): BP is used as a scratch register; the 8-byte frame presumably
// exists so the assembler saves and restores it - confirm.

// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_6x6_64Xor(SB), $8-88
	// Loading 24 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_6x6_64Xor_end

	// Tables for inputs 0-3 stay resident in Z0-Z23; inputs 4 and 5 use
	// embedded-broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Data pointers of in[0..5] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointers of out[0..5]; R10 is reused for the last one.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, DX

mulGFNI_6x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R10), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_6x6_64Xor_loop
	VZEROUPPER

mulGFNI_6x6_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_6x6Xor combines 6 input slices into 6 output slices using
// 256-bit GFNI affine transforms, XORing the result into the bytes already
// present in the outputs.
//
// For input i and output j the 64-bit table at matrix[i*6+j] is broadcast to
// a YMM register and applied to 32 input bytes with VGF2P8AFFINEQB; the
// result is XORed into the accumulator for output j, which starts as the
// current output contents.
//
// All input and output pointers are advanced by start before the loop.
// Only n>>5 whole 32-byte blocks are processed; if that count is zero the
// function returns without touching memory.
//
// NOTE(review): BP is used as a scratch register; the 8-byte frame presumably
// exists so the assembler saves and restores it - confirm.

// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88
	// Loading 8 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_6x6Xor_end

	// Tables 0-7 stay resident in Y0-Y7; the rest are re-broadcast per use.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of in[0..5] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Data pointers of out[0..5]; R10 is reused for the last one.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, DX

mulAvxGFNI_6x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R10), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_6x6Xor_loop
	VZEROUPPER

mulAvxGFNI_6x6Xor_end:
	RET
|
|
|
|
// mulGFNI_6x7_64 combines 6 input slices into 7 output slices using 512-bit
// GFNI affine transforms, overwriting the outputs.
//
// For input i and output j the 64-bit table at matrix[i*7+j] is applied to
// 64 input bytes with VGF2P8AFFINEQB. Input 0 initialises the seven
// accumulators Z23-Z29; inputs 1-5 are XORed into them.
//
// All input and output pointers are advanced by start before the loop.
// Only n>>6 whole 64-byte blocks are processed; if that count is zero the
// function returns without touching memory.
//
// Every general-purpose register used here is occupied by a pointer, so AX
// (the initial block count) is recycled as the last input pointer and the
// count is recomputed into BP once start is no longer needed.
//
// NOTE(review): BP is used as a scratch register; the 8-byte frame presumably
// exists so the assembler saves and restores it - confirm.

// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_6x7_64(SB), $8-88
	// Loading 23 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 51 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_6x7_64_end

	// Tables 0-22 stay resident in Z0-Z22; the rest use embedded-broadcast
	// memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Data pointers of in[0..5] (slice headers are 24 bytes apart);
	// AX is reused for the last one.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), AX

	// Data pointers of out[0..6]; R9 is reused for the last one.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_6x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_6x7_64_loop
	VZEROUPPER

mulGFNI_6x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every full 32-byte block, and for each output j in 0..6, this stores
//   out[j][block] = XOR over i in 0..5 of AFFINE(matrix[i*7+j], in[i][block])
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry broadcast to every lane. Outputs are overwritten, not accumulated.
// Processing begins at byte offset `start` of every slice and covers n>>5
// blocks; a tail of n%32 bytes is left untouched.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// GF(2^8) coefficient multiplication - confirm against the Go caller.
//
// Register roles:
//   CX                      matrix base pointer
//   DX, BX, SI, DI, R8, AX  data pointers of in[0..5]
//   R10-R15, R9             data pointers of out[0..6]
//   BP                      start offset, later the remaining block count
//   Y0-Y6                   tables 0..6 (input 0), broadcast once up front
//   Y7-Y13                  accumulators for outputs 0..6
//   Y14                     current input block
//   Y15                     scratch (table broadcast / product)
TEXT ·mulAvxGFNI_6x7(SB), $8-88
	// Loading 7 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 51 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// test it directly: nothing to do when fewer than 32 bytes are requested.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x7_end

	// Tables for input 0 stay resident for the whole loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice header (24 bytes apart).
	// AX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), AX

	// Same for the outputs; R9 is the header base and is overwritten last.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_6x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (the first input initialises the accumulators, no XOR needed)
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_6x7_loop
	VZEROUPPER

mulAvxGFNI_6x7_end:
	RET
|
|
|
|
// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every full 64-byte block, and for each output j in 0..6, this updates
//   out[j][block] ^= XOR over i in 0..5 of AFFINE(matrix[i*7+j], in[i][block])
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry broadcast to every lane. The existing output bytes are loaded first
// and accumulated into. Processing begins at byte offset `start` of every
// slice and covers n>>6 blocks; a tail of n%64 bytes is left untouched.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// GF(2^8) coefficient multiplication - confirm against the Go caller.
//
// Register roles:
//   CX                      matrix base pointer
//   DX, BX, SI, DI, R8, AX  data pointers of in[0..5]
//   R10-R15, R9             data pointers of out[0..6]
//   BP                      start offset, later the remaining block count
//   Z0-Z22                  tables 0..22, broadcast once up front
//   Z23-Z29                 accumulators for outputs 0..6
//   Z30                     current input block
//   Z31                     scratch product
// Tables 23..41 do not fit in registers and are used as embedded-broadcast
// memory operands (.BCST) inside the loop.
TEXT ·mulGFNI_6x7_64Xor(SB), $8-88
	// Loading 23 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 51 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// test it directly: nothing to do when fewer than 64 bytes are requested.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x7_64Xor_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice header (24 bytes apart).
	// AX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), AX

	// Same for the outputs; R9 is the header base and is overwritten last.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_6x7_64Xor_loop:
	// Load 7 outputs
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R9), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (register-resident tables run out after Z22; the rest come from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z22, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_6x7_64Xor_loop
	VZEROUPPER

mulGFNI_6x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every full 32-byte block, and for each output j in 0..6, this updates
//   out[j][block] ^= XOR over i in 0..5 of AFFINE(matrix[i*7+j], in[i][block])
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry broadcast to every lane. The existing output bytes are loaded first
// and accumulated into. Processing begins at byte offset `start` of every
// slice and covers n>>5 blocks; a tail of n%32 bytes is left untouched.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// GF(2^8) coefficient multiplication - confirm against the Go caller.
//
// Register roles:
//   CX                      matrix base pointer
//   DX, BX, SI, DI, R8, AX  data pointers of in[0..5]
//   R10-R15, R9             data pointers of out[0..6]
//   BP                      start offset, later the remaining block count
//   Y0-Y6                   tables 0..6 (input 0), broadcast once up front
//   Y7-Y13                  accumulators for outputs 0..6
//   Y14                     current input block
//   Y15                     scratch (table broadcast / product)
TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88
	// Loading 7 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 51 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// test it directly: nothing to do when fewer than 32 bytes are requested.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x7Xor_end

	// Tables for input 0 stay resident for the whole loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice header (24 bytes apart).
	// AX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), AX

	// Same for the outputs; R9 is the header base and is overwritten last.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_6x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_6x7Xor_loop
	VZEROUPPER

mulAvxGFNI_6x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every full 64-byte block, and for each output j in 0..7, this stores
//   out[j][block] = XOR over i in 0..5 of AFFINE(matrix[i*8+j], in[i][block])
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry broadcast to every lane. Outputs are overwritten, not accumulated.
// Processing begins at byte offset `start` of every slice and covers n>>6
// blocks; a tail of n%64 bytes is left untouched.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// GF(2^8) coefficient multiplication - confirm against the Go caller.
//
// Register roles:
//   AX                      remaining block count
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  data pointers of in[0..5]
//   R10                     base of the out slice headers
//   R11                     running byte offset into every output (starts at `start`)
//   R12                     scratch: data pointer of the output being stored
//   Z0-Z21                  tables 0..21, broadcast once up front
//   Z22-Z29                 accumulators for outputs 0..7
//   Z30                     current input block
//   Z31                     scratch product
// Tables 22..47 do not fit in registers and are used as embedded-broadcast
// memory operands (.BCST) inside the loop. Output data pointers are re-read
// from their slice headers on every store rather than kept in registers.
TEXT ·mulGFNI_6x8_64(SB), $0-88
	// Loading 22 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// test it directly: nothing to do when fewer than 64 bytes are requested.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x8_64_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice header (24 bytes apart).
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (register-resident tables run out after Z21; the rest come from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// Each data pointer is fetched from its slice header and indexed by R11.
	MOVQ      (R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x8_64_loop
	VZEROUPPER

mulGFNI_6x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every full 32-byte block, and for each output j in 0..7, this stores
//   out[j][block] = XOR over i in 0..5 of AFFINE(matrix[i*8+j], in[i][block])
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and the 64-bit matrix
// entry broadcast to every lane. Outputs are overwritten, not accumulated.
// Processing begins at byte offset `start` of every slice and covers n>>5
// blocks; a tail of n%32 bytes is left untouched.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// GF(2^8) coefficient multiplication - confirm against the Go caller.
//
// Register roles:
//   AX                      remaining block count
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  data pointers of in[0..5]
//   R10                     base of the out slice headers
//   R11                     running byte offset into every output (starts at `start`)
//   R12                     scratch: data pointer of the output being stored
//   Y0-Y5                   tables 0..5, broadcast once up front
//   Y6-Y13                  accumulators for outputs 0..7
//   Y14                     current input block
//   Y15                     scratch (table broadcast / product)
// Output data pointers are re-read from their slice headers on every store
// rather than kept in registers.
TEXT ·mulAvxGFNI_6x8(SB), $0-88
	// Loading 6 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// test it directly: nothing to do when fewer than 32 bytes are requested.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x8_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice header (24 bytes apart).
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, no XOR needed; tables
	// 6 and 7 are broadcast straight into their accumulator registers)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD   48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	// Each data pointer is fetched from its slice header and indexed by R11.
	MOVQ    (R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ    24(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ    48(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ    72(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ    96(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ    120(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ    144(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ    168(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x20, R11
	DECQ AX
	JNZ  mulAvxGFNI_6x8_loop
	VZEROUPPER

mulAvxGFNI_6x8_end:
	RET
|
|
|
|
// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6 inputs, 8 outputs, 64 bytes per iteration, accumulating variant.
// For every output j (0..7) the current 64 output bytes are loaded, the
// VGF2P8AFFINEQB (immediate 0) transform of each input block i (0..5) with
// the 8x8 bit matrix matrix[i*8+j] is XORed in, and the result is stored back.
//
// The loop runs n>>6 times; when n < 64 the function returns without touching
// memory, and the n&63 trailing bytes are never processed here.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  read cursors for in[0]..in[5], pre-advanced by start
//   R10                     base of the out slice-header array (24 bytes per header)
//   R11                     byte offset applied to every output, begins at start
//   R12                     scratch: data pointer of the output being accessed
//   Z0-Z21                  matrix[0..21], broadcast once before the loop
//   Z22-Z29                 accumulators for outputs 0..7
//   Z30                     current input block
//   Z31                     scratch product
// matrix[22..47] do not fit in registers and are consumed through the
// embedded-broadcast memory operand on each iteration.
TEXT ·mulGFNI_6x8_64Xor(SB), $0-88
	// Loading 22 of 48 tables to registers
	// Destination kept on stack: output data pointers are re-read from the
	// out slice headers on every access instead of being held in registers.
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64; SHRQ sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x8_64Xor_end

	// Broadcast the first 22 matrix entries (all of inputs 0 and 1, and
	// outputs 0..5 of input 2) across Z0-Z21.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice; DX is reused for in[5] last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x8_64Xor_loop:
	// Load 8 outputs
	MOVQ      (R10), R12
	VMOVDQU64 (R12)(R11*1), Z22
	MOVQ      24(R10), R12
	VMOVDQU64 (R12)(R11*1), Z23
	MOVQ      48(R10), R12
	VMOVDQU64 (R12)(R11*1), Z24
	MOVQ      72(R10), R12
	VMOVDQU64 (R12)(R11*1), Z25
	MOVQ      96(R10), R12
	VMOVDQU64 (R12)(R11*1), Z26
	MOVQ      120(R10), R12
	VMOVDQU64 (R12)(R11*1), Z27
	MOVQ      144(R10), R12
	VMOVDQU64 (R12)(R11*1), Z28
	MOVQ      168(R10), R12
	VMOVDQU64 (R12)(R11*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (register-resident entries end at Z21; the rest use memory broadcast)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	MOVQ      (R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x8_64Xor_loop
	VZEROUPPER

mulGFNI_6x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6 inputs, 8 outputs, 32 bytes per iteration, accumulating variant.
// For every output j (0..7) the current 32 output bytes are loaded, the
// VGF2P8AFFINEQB (immediate 0) transform of each input block i (0..5) with
// the 8x8 bit matrix matrix[i*8+j] is XORed in, and the result is stored back.
//
// The loop runs n>>5 times; when n < 32 the function returns without touching
// memory, and the n&31 trailing bytes are never processed here.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  read cursors for in[0]..in[5], pre-advanced by start
//   R10                     base of the out slice-header array (24 bytes per header)
//   R11                     byte offset applied to every output, begins at start
//   R12                     scratch: data pointer of the output being accessed
//   Y0-Y5                   matrix[0..5], broadcast once before the loop
//   Y6-Y13                  accumulators for outputs 0..7
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix entry, then the product
// matrix[6..47] are re-broadcast into Y15 on every iteration.
TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88
	// Loading 6 of 48 tables to registers
	// Destination kept on stack: output data pointers are re-read from the
	// out slice headers on every access instead of being held in registers.
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32; SHRQ sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x8Xor_end

	// Broadcast the six matrix entries that stay resident (input 0, outputs 0..5).
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice; DX is reused for in[5] last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x8Xor_loop:
	// Load 8 outputs
	MOVQ    (R10), R12
	VMOVDQU (R12)(R11*1), Y6
	MOVQ    24(R10), R12
	VMOVDQU (R12)(R11*1), Y7
	MOVQ    48(R10), R12
	VMOVDQU (R12)(R11*1), Y8
	MOVQ    72(R10), R12
	VMOVDQU (R12)(R11*1), Y9
	MOVQ    96(R10), R12
	VMOVDQU (R12)(R11*1), Y10
	MOVQ    120(R10), R12
	VMOVDQU (R12)(R11*1), Y11
	MOVQ    144(R10), R12
	VMOVDQU (R12)(R11*1), Y12
	MOVQ    168(R10), R12
	VMOVDQU (R12)(R11*1), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	MOVQ    (R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ    24(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ    48(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ    72(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ    96(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ    120(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ    144(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ    168(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x20, R11
	DECQ AX
	JNZ  mulAvxGFNI_6x8Xor_loop
	VZEROUPPER

mulAvxGFNI_6x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6 inputs, 9 outputs, 64 bytes per iteration, overwriting variant.
// For every output j (0..8) the result is the XOR over inputs i (0..5) of the
// VGF2P8AFFINEQB (immediate 0) transform of the input block with the 8x8 bit
// matrix matrix[i*9+j]. Input 0 initialises the accumulators directly, so the
// previous contents of the outputs are never read.
//
// The loop runs n>>6 times; when n < 64 the function returns without touching
// memory, and the n&63 trailing bytes are never processed here.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  read cursors for in[0]..in[5], pre-advanced by start
//   R10                     base of the out slice-header array (24 bytes per header)
//   R11                     byte offset applied to every output, begins at start
//   R12                     scratch: data pointer of the output being stored
//   Z0-Z20                  matrix[0..20], broadcast once before the loop
//   Z21-Z29                 accumulators for outputs 0..8
//   Z30                     current input block
//   Z31                     scratch product
// matrix[21..53] do not fit in registers and are consumed through the
// embedded-broadcast memory operand on each iteration.
TEXT ·mulGFNI_6x9_64(SB), $0-88
	// Loading 21 of 54 tables to registers
	// Destination kept on stack: output data pointers are re-read from the
	// out slice headers on every store instead of being held in registers.
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64; SHRQ sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x9_64_end

	// Broadcast the first 21 matrix entries (all of inputs 0 and 1, and
	// outputs 0..2 of input 2) across Z0-Z20.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice; DX is reused for in[5] last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (products are written straight into the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (register-resident entries end at Z20; the rest use memory broadcast)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	MOVQ      (R10), R12
	VMOVDQU64 Z21, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      192(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x9_64_loop
	VZEROUPPER

mulGFNI_6x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6 inputs, 9 outputs, 32 bytes per iteration, overwriting variant.
// For every output j (0..8) the result is the XOR over inputs i (0..5) of the
// VGF2P8AFFINEQB (immediate 0) transform of the input block with the 8x8 bit
// matrix matrix[i*9+j]. Input 0 initialises the accumulators directly, so the
// previous contents of the outputs are never read.
//
// The loop runs n>>5 times; when n < 32 the function returns without touching
// memory, and the n&31 trailing bytes are never processed here.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  read cursors for in[0]..in[5], pre-advanced by start
//   R10                     base of the out slice-header array (24 bytes per header)
//   R11                     byte offset applied to every output, begins at start
//   R12                     scratch: data pointer of the output being stored
//   Y0-Y4                   matrix[0..4], broadcast once before the loop
//   Y5-Y13                  accumulators for outputs 0..8
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix entry, then the product
// matrix[5..53] are re-broadcast on every iteration (entries 5..8 straight
// into their accumulator register, the rest into Y15).
TEXT ·mulAvxGFNI_6x9(SB), $0-88
	// Loading 5 of 54 tables to registers
	// Destination kept on stack: output data pointers are re-read from the
	// out slice headers on every store instead of being held in registers.
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32; SHRQ sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x9_end

	// Broadcast the five matrix entries that stay resident (input 0, outputs 0..4).
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice; DX is reused for in[5] last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (products are written straight into the accumulators, no XOR needed;
	// for outputs 5..8 the accumulator doubles as the matrix-entry register)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	MOVQ    (R10), R12
	VMOVDQU Y5, (R12)(R11*1)
	MOVQ    24(R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ    48(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ    72(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ    96(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ    120(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ    144(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ    168(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ    192(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x20, R11
	DECQ AX
	JNZ  mulAvxGFNI_6x9_loop
	VZEROUPPER

mulAvxGFNI_6x9_end:
	RET
|
|
|
|
// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6 inputs, 9 outputs, 64 bytes per iteration, accumulating variant.
// For every output j (0..8) the current 64 output bytes are loaded, the
// VGF2P8AFFINEQB (immediate 0) transform of each input block i (0..5) with
// the 8x8 bit matrix matrix[i*9+j] is XORed in, and the result is stored back.
//
// The loop runs n>>6 times; when n < 64 the function returns without touching
// memory, and the n&63 trailing bytes are never processed here.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base pointer
//   BX, SI, DI, R8, R9, DX  read cursors for in[0]..in[5], pre-advanced by start
//   R10                     base of the out slice-header array (24 bytes per header)
//   R11                     byte offset applied to every output, begins at start
//   R12                     scratch: data pointer of the output being accessed
//   Z0-Z20                  matrix[0..20], broadcast once before the loop
//   Z21-Z29                 accumulators for outputs 0..8
//   Z30                     current input block
//   Z31                     scratch product
// matrix[21..53] do not fit in registers and are consumed through the
// embedded-broadcast memory operand on each iteration.
TEXT ·mulGFNI_6x9_64Xor(SB), $0-88
	// Loading 21 of 54 tables to registers
	// Destination kept on stack: output data pointers are re-read from the
	// out slice headers on every access instead of being held in registers.
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64; SHRQ sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x9_64Xor_end

	// Broadcast the first 21 matrix entries (all of inputs 0 and 1, and
	// outputs 0..2 of input 2) across Z0-Z20.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice; DX is reused for in[5] last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x9_64Xor_loop:
	// Load 9 outputs
	MOVQ      (R10), R12
	VMOVDQU64 (R12)(R11*1), Z21
	MOVQ      24(R10), R12
	VMOVDQU64 (R12)(R11*1), Z22
	MOVQ      48(R10), R12
	VMOVDQU64 (R12)(R11*1), Z23
	MOVQ      72(R10), R12
	VMOVDQU64 (R12)(R11*1), Z24
	MOVQ      96(R10), R12
	VMOVDQU64 (R12)(R11*1), Z25
	MOVQ      120(R10), R12
	VMOVDQU64 (R12)(R11*1), Z26
	MOVQ      144(R10), R12
	VMOVDQU64 (R12)(R11*1), Z27
	MOVQ      168(R10), R12
	VMOVDQU64 (R12)(R11*1), Z28
	MOVQ      192(R10), R12
	VMOVDQU64 (R12)(R11*1), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (register-resident entries end at Z20; the rest use memory broadcast)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	MOVQ      (R10), R12
	VMOVDQU64 Z21, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      192(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x9_64Xor_loop
	VZEROUPPER

mulGFNI_6x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks beginning at byte offset start:
//   out[i][blk] ^= XOR over j=0..5 of affine(matrix[j*9+i], in[j][blk])   (i = 0..8)
// where affine is VGF2P8AFFINEQB with a zero constant term. The existing
// contents of out are loaded first and folded into the result (Xor variant).
// Bytes past the last whole block (n & 31) are not touched.
// No bounds checks are performed: 6 slice headers are read from in and 9 from out.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// Reed-Solomon coefficient multiply - confirm against the Go caller.
//
// Register roles:
//   AX                 remaining 32-byte blocks
//   CX                 matrix base; entry k is at k*8(CX)
//   Y0-Y4              matrix[0..4], broadcast once before the loop
//   BX SI DI R8 R9 DX  read cursors for in[0..5], already advanced by start
//   R10                base of the out slice-header array (24 bytes per header)
//   R11                byte offset applied to every out[i]; begins at start
//   R12                scratch: data pointer of the out[i] being accessed
//   Y5-Y13             accumulators for out[0..8]
//   Y14 / Y15          current input block / scratch
TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88
	// Block count = n / 32; leave immediately when there is no whole block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_6x9Xor_end

	// Only 5 of the 54 matrix entries are kept resident; the rest are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Data pointers of in[0..5]; DX is overwritten last because it holds the base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Output data pointers are re-read from the header array on every access.
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Advance every input cursor to the start offset
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x9Xor_loop:
	// Seed the 9 accumulators with the current output contents
	MOVQ (R10), R12
	VMOVDQU (R12)(R11*1), Y5
	MOVQ 24(R10), R12
	VMOVDQU (R12)(R11*1), Y6
	MOVQ 48(R10), R12
	VMOVDQU (R12)(R11*1), Y7
	MOVQ 72(R10), R12
	VMOVDQU (R12)(R11*1), Y8
	MOVQ 96(R10), R12
	VMOVDQU (R12)(R11*1), Y9
	MOVQ 120(R10), R12
	VMOVDQU (R12)(R11*1), Y10
	MOVQ 144(R10), R12
	VMOVDQU (R12)(R11*1), Y11
	MOVQ 168(R10), R12
	VMOVDQU (R12)(R11*1), Y12
	MOVQ 192(R10), R12
	VMOVDQU (R12)(R11*1), Y13

	// Input 0 -> 9 outputs (entries 0-4 come from Y0-Y4, 5-8 are broadcast here)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 1 -> 9 outputs (matrix entries 9-17)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 2 -> 9 outputs (matrix entries 18-26)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 3 -> 9 outputs (matrix entries 27-35)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 4 -> 9 outputs (matrix entries 36-44)
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 5 -> 9 outputs (matrix entries 45-53)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Write the 9 accumulators back to out[0..8]
	MOVQ (R10), R12
	VMOVDQU Y5, (R12)(R11*1)
	MOVQ 24(R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ 48(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ 72(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ 96(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ 120(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ 144(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ 168(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ 192(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Next block: bump the shared output offset, count down
	ADDQ $0x20, R11
	DECQ AX
	JNZ mulAvxGFNI_6x9Xor_loop

	// Only reached after at least one iteration used the YMM registers
	VZEROUPPER

mulAvxGFNI_6x9Xor_end:
	RET
|
|
|
|
// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks beginning at byte offset start:
//   out[i][blk] = XOR over j=0..5 of affine(matrix[j*10+i], in[j][blk])   (i = 0..9)
// where affine is VGF2P8AFFINEQB with a zero constant term. The previous
// contents of out are overwritten, not read.
// Bytes past the last whole block (n & 63) are not touched.
// No bounds checks are performed: 6 slice headers are read from in and 10 from out.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// Reed-Solomon coefficient multiply - confirm against the Go caller.
//
// Register roles:
//   AX                 remaining 64-byte blocks
//   CX                 matrix base; entry k is at k*8(CX)
//   Z0-Z19             matrix[0..19] (all entries for inputs 0 and 1)
//   BX SI DI R8 R9 DX  read cursors for in[0..5], already advanced by start
//   R10                base of the out slice-header array (24 bytes per header)
//   R11                byte offset applied to every out[i]; begins at start
//   R12                scratch: data pointer of the out[i] being written
//   Z20-Z29            accumulators for out[0..9]
//   Z30 / Z31          current input block / scratch
TEXT ·mulGFNI_6x10_64(SB), $0-88
	// Block count = n / 64; leave immediately when there is no whole block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_6x10_64_end

	// 20 of the 60 matrix entries stay resident; the remaining 40 are used
	// through embedded-broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Data pointers of in[0..5]; DX is overwritten last because it holds the base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Output data pointers are re-read from the header array on every store.
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Advance every input cursor to the start offset
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x10_64_loop:
	// Input 0 -> 10 outputs: results initialise the accumulators directly
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Input 1 -> 10 outputs (resident entries Z10-Z19)
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 2 -> 10 outputs (matrix entries 20-29, broadcast from memory)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 3 -> 10 outputs (matrix entries 30-39)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 4 -> 10 outputs (matrix entries 40-49)
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 5 -> 10 outputs (matrix entries 50-59)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Write the 10 accumulators to out[0..9]
	MOVQ (R10), R12
	VMOVDQU64 Z20, (R12)(R11*1)
	MOVQ 24(R10), R12
	VMOVDQU64 Z21, (R12)(R11*1)
	MOVQ 48(R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ 72(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ 96(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ 120(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ 144(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ 168(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ 192(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ 216(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Next block: bump the shared output offset, count down
	ADDQ $0x40, R11
	DECQ AX
	JNZ mulGFNI_6x10_64_loop

	// Only reached after at least one iteration used the vector registers
	VZEROUPPER

mulGFNI_6x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks beginning at byte offset start:
//   out[i][blk] = XOR over j=0..5 of affine(matrix[j*10+i], in[j][blk])   (i = 0..9)
// where affine is VGF2P8AFFINEQB with a zero constant term. The previous
// contents of out are overwritten, not read.
// Bytes past the last whole block (n & 31) are not touched.
// No bounds checks are performed: 6 slice headers are read from in and 10 from out.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// Reed-Solomon coefficient multiply - confirm against the Go caller.
//
// Register roles:
//   AX                 remaining 32-byte blocks
//   CX                 matrix base; entry k is at k*8(CX)
//   Y0-Y3              matrix[0..3], broadcast once before the loop
//   BX SI DI R8 R9 DX  read cursors for in[0..5], already advanced by start
//   R10                base of the out slice-header array (24 bytes per header)
//   R11                byte offset applied to every out[i]; begins at start
//   R12                scratch: data pointer of the out[i] being written
//   Y4-Y13             accumulators for out[0..9]
//   Y14 / Y15          current input block / scratch
TEXT ·mulAvxGFNI_6x10(SB), $0-88
	// Block count = n / 32; leave immediately when there is no whole block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_6x10_end

	// Only 4 of the 60 matrix entries are kept resident; the rest are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Data pointers of in[0..5]; DX is overwritten last because it holds the base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Output data pointers are re-read from the header array on every store.
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Advance every input cursor to the start offset
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x10_loop:
	// Input 0 -> 10 outputs: results initialise the accumulators directly.
	// Outputs 4-9 broadcast their matrix entry into the accumulator itself
	// and then transform in place.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Input 1 -> 10 outputs (matrix entries 10-19)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 2 -> 10 outputs (matrix entries 20-29)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 3 -> 10 outputs (matrix entries 30-39)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 4 -> 10 outputs (matrix entries 40-49)
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Input 5 -> 10 outputs (matrix entries 50-59)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Write the 10 accumulators to out[0..9]
	MOVQ (R10), R12
	VMOVDQU Y4, (R12)(R11*1)
	MOVQ 24(R10), R12
	VMOVDQU Y5, (R12)(R11*1)
	MOVQ 48(R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ 72(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ 96(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ 120(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ 144(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ 168(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ 192(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ 216(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Next block: bump the shared output offset, count down
	ADDQ $0x20, R11
	DECQ AX
	JNZ mulAvxGFNI_6x10_loop

	// Only reached after at least one iteration used the YMM registers
	VZEROUPPER

mulAvxGFNI_6x10_end:
	RET
|
|
|
|
// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks beginning at byte offset start:
//   out[i][blk] ^= XOR over j=0..5 of affine(matrix[j*10+i], in[j][blk])   (i = 0..9)
// where affine is VGF2P8AFFINEQB with a zero constant term. The existing
// contents of out are loaded first and folded into the result (Xor variant).
// Bytes past the last whole block (n & 63) are not touched.
// No bounds checks are performed: 6 slice headers are read from in and 10 from out.
// NOTE(review): presumably each matrix entry is an 8x8 bit matrix encoding a
// Reed-Solomon coefficient multiply - confirm against the Go caller.
//
// Register roles:
//   AX                 remaining 64-byte blocks
//   CX                 matrix base; entry k is at k*8(CX)
//   Z0-Z19             matrix[0..19] (all entries for inputs 0 and 1)
//   BX SI DI R8 R9 DX  read cursors for in[0..5], already advanced by start
//   R10                base of the out slice-header array (24 bytes per header)
//   R11                byte offset applied to every out[i]; begins at start
//   R12                scratch: data pointer of the out[i] being accessed
//   Z20-Z29            accumulators for out[0..9]
//   Z30 / Z31          current input block / scratch
TEXT ·mulGFNI_6x10_64Xor(SB), $0-88
	// Block count = n / 64; leave immediately when there is no whole block.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_6x10_64Xor_end

	// 20 of the 60 matrix entries stay resident; the remaining 40 are used
	// through embedded-broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Data pointers of in[0..5]; DX is overwritten last because it holds the base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// Output data pointers are re-read from the header array on every access.
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Advance every input cursor to the start offset
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x10_64Xor_loop:
	// Seed the 10 accumulators with the current output contents
	MOVQ (R10), R12
	VMOVDQU64 (R12)(R11*1), Z20
	MOVQ 24(R10), R12
	VMOVDQU64 (R12)(R11*1), Z21
	MOVQ 48(R10), R12
	VMOVDQU64 (R12)(R11*1), Z22
	MOVQ 72(R10), R12
	VMOVDQU64 (R12)(R11*1), Z23
	MOVQ 96(R10), R12
	VMOVDQU64 (R12)(R11*1), Z24
	MOVQ 120(R10), R12
	VMOVDQU64 (R12)(R11*1), Z25
	MOVQ 144(R10), R12
	VMOVDQU64 (R12)(R11*1), Z26
	MOVQ 168(R10), R12
	VMOVDQU64 (R12)(R11*1), Z27
	MOVQ 192(R10), R12
	VMOVDQU64 (R12)(R11*1), Z28
	MOVQ 216(R10), R12
	VMOVDQU64 (R12)(R11*1), Z29

	// Input 0 -> 10 outputs (resident entries Z0-Z9)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 1 -> 10 outputs (resident entries Z10-Z19)
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 2 -> 10 outputs (matrix entries 20-29, broadcast from memory)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 3 -> 10 outputs (matrix entries 30-39)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 4 -> 10 outputs (matrix entries 40-49)
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Input 5 -> 10 outputs (matrix entries 50-59)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Write the 10 accumulators back to out[0..9]
	MOVQ (R10), R12
	VMOVDQU64 Z20, (R12)(R11*1)
	MOVQ 24(R10), R12
	VMOVDQU64 Z21, (R12)(R11*1)
	MOVQ 48(R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ 72(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ 96(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ 120(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ 144(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ 168(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ 192(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ 216(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Next block: bump the shared output offset, count down
	ADDQ $0x40, R11
	DECQ AX
	JNZ mulGFNI_6x10_64Xor_loop

	// Only reached after at least one iteration used the vector registers
	VZEROUPPER

mulGFNI_6x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88
|
|
// Loading 4 of 60 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 72 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_6x10Xor_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), DX
|
|
MOVQ out_base+48(FP), R10
|
|
MOVQ out_base+48(FP), R10
|
|
MOVQ start+72(FP), R11
|
|
|
|
// Add start offset to input
|
|
ADDQ R11, BX
|
|
ADDQ R11, SI
|
|
ADDQ R11, DI
|
|
ADDQ R11, R8
|
|
ADDQ R11, R9
|
|
ADDQ R11, DX
|
|
|
|
mulAvxGFNI_6x10Xor_loop:
|
|
// Load 10 outputs
|
|
MOVQ (R10), R12
|
|
VMOVDQU (R12)(R11*1), Y4
|
|
MOVQ 24(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y5
|
|
MOVQ 48(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y6
|
|
MOVQ 72(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y7
|
|
MOVQ 96(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y8
|
|
MOVQ 120(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y9
|
|
MOVQ 144(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y10
|
|
MOVQ 168(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y11
|
|
MOVQ 192(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y12
|
|
MOVQ 216(R10), R12
|
|
VMOVDQU (R12)(R11*1), Y13
|
|
|
|
// Load and process 32 bytes from input 0 to 10 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 32(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 40(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 48(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 56(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 64(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 10 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 10 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 10 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 10 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 360(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 368(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 376(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 384(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 392(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 10 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 400(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 408(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 416(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 424(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 432(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 440(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 448(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 456(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 464(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 472(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 10 outputs
|
|
MOVQ (R10), R12
|
|
VMOVDQU Y4, (R12)(R11*1)
|
|
MOVQ 24(R10), R12
|
|
VMOVDQU Y5, (R12)(R11*1)
|
|
MOVQ 48(R10), R12
|
|
VMOVDQU Y6, (R12)(R11*1)
|
|
MOVQ 72(R10), R12
|
|
VMOVDQU Y7, (R12)(R11*1)
|
|
MOVQ 96(R10), R12
|
|
VMOVDQU Y8, (R12)(R11*1)
|
|
MOVQ 120(R10), R12
|
|
VMOVDQU Y9, (R12)(R11*1)
|
|
MOVQ 144(R10), R12
|
|
VMOVDQU Y10, (R12)(R11*1)
|
|
MOVQ 168(R10), R12
|
|
VMOVDQU Y11, (R12)(R11*1)
|
|
MOVQ 192(R10), R12
|
|
VMOVDQU Y12, (R12)(R11*1)
|
|
MOVQ 216(R10), R12
|
|
VMOVDQU Y13, (R12)(R11*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R11
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_6x10Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_6x10Xor_end:
|
|
RET
|
|
|
|
// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x1_64 combines 7 input slices into 1 output slice, 64 bytes per
// iteration. Each 64-byte input block is transformed with VGF2P8AFFINEQB
// (imm8 = 0) using the 8x8 bit matrix held in matrix[i], and the 7 results
// are XORed together. The previous contents of out[0] are overwritten.
//
// Processing starts at byte offset start in every slice and covers n>>6
// whole 64-byte blocks; an n%64 tail is not processed. When n < 64 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   Z0-Z6                        matrix[0..6], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R10                          write cursor for out[0]
//   Z7                           accumulator
//   Z8                           scratch (loaded input / transformed input)
TEXT ·mulGFNI_7x1_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x1_64_end

	// Broadcast the 7 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R10
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, CX

mulGFNI_7x1_64_loop:
	// Input 0 writes the accumulator directly (no XOR needed).
	VMOVDQU64      (DX), Z8
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z7

	// Input 1
	VMOVDQU64      (BX), Z8
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 2
	VMOVDQU64      (SI), Z8
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 3
	VMOVDQU64      (DI), Z8
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 4
	VMOVDQU64      (R8), Z8
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 5
	VMOVDQU64      (R9), Z8
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 6
	VMOVDQU64      (CX), Z8
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Store 1 outputs
	VMOVDQU64 Z7, (R10)
	ADDQ      $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x1_64_loop
	VZEROUPPER

mulGFNI_7x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x1 combines 7 input slices into 1 output slice, 32 bytes per
// iteration. Each 32-byte input block is transformed with VGF2P8AFFINEQB
// (imm8 = 0) using the 8x8 bit matrix held in matrix[i], and the 7 results
// are XORed together. The previous contents of out[0] are overwritten.
//
// Processing starts at byte offset start in every slice and covers n>>5
// whole 32-byte blocks; an n%32 tail is not processed. When n < 32 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 32-byte blocks
//   Y0-Y6                        matrix[0..6], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R10                          write cursor for out[0]
//   Y7                           accumulator
//   Y8                           scratch (loaded input / transformed input)
TEXT ·mulAvxGFNI_7x1(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x1_end

	// Broadcast the 7 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R10
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, CX

mulAvxGFNI_7x1_loop:
	// Input 0 writes the accumulator directly (no XOR needed).
	VMOVDQU        (DX), Y8
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y7

	// Input 1
	VMOVDQU        (BX), Y8
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 2
	VMOVDQU        (SI), Y8
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 3
	VMOVDQU        (DI), Y8
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 4
	VMOVDQU        (R8), Y8
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 5
	VMOVDQU        (R9), Y8
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 6
	VMOVDQU        (CX), Y8
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Store 1 outputs
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x1_loop
	VZEROUPPER

mulAvxGFNI_7x1_end:
	RET
|
|
|
|
// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x1_64Xor is the accumulating form of the 7-input, 1-output
// kernel, working on 64 bytes per iteration. The current contents of out[0]
// are loaded first, each 64-byte input block is transformed with
// VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix held in matrix[i], and
// all 7 results are XORed into the loaded value before it is stored back.
//
// Processing starts at byte offset start in every slice and covers n>>6
// whole 64-byte blocks; an n%64 tail is not processed. When n < 64 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   Z0-Z6                        matrix[0..6], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R10                          read/write cursor for out[0]
//   Z7                           accumulator
//   Z8                           scratch (loaded input / transformed input)
TEXT ·mulGFNI_7x1_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x1_64Xor_end

	// Broadcast the 7 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R10
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, CX

mulGFNI_7x1_64Xor_loop:
	// Load 1 outputs: the accumulator starts from the existing out[0] data.
	VMOVDQU64 (R10), Z7

	// Input 0
	VMOVDQU64      (DX), Z8
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 1
	VMOVDQU64      (BX), Z8
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 2
	VMOVDQU64      (SI), Z8
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 3
	VMOVDQU64      (DI), Z8
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 4
	VMOVDQU64      (R8), Z8
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 5
	VMOVDQU64      (R9), Z8
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Input 6
	VMOVDQU64      (CX), Z8
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
	VXORPD         Z7, Z8, Z7

	// Store 1 outputs
	VMOVDQU64 Z7, (R10)
	ADDQ      $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x1_64Xor_loop
	VZEROUPPER

mulGFNI_7x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x1Xor is the accumulating form of the 7-input, 1-output
// kernel, working on 32 bytes per iteration. The current contents of out[0]
// are loaded first, each 32-byte input block is transformed with
// VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix held in matrix[i], and
// all 7 results are XORed into the loaded value before it is stored back.
//
// Processing starts at byte offset start in every slice and covers n>>5
// whole 32-byte blocks; an n%32 tail is not processed. When n < 32 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 32-byte blocks
//   Y0-Y6                        matrix[0..6], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R10                          read/write cursor for out[0]
//   Y7                           accumulator
//   Y8                           scratch (loaded input / transformed input)
TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x1Xor_end

	// Broadcast the 7 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R10
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, CX

mulAvxGFNI_7x1Xor_loop:
	// Load 1 outputs: the accumulator starts from the existing out[0] data.
	VMOVDQU (R10), Y7

	// Input 0
	VMOVDQU        (DX), Y8
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 1
	VMOVDQU        (BX), Y8
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 2
	VMOVDQU        (SI), Y8
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 3
	VMOVDQU        (DI), Y8
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 4
	VMOVDQU        (R8), Y8
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 5
	VMOVDQU        (R9), Y8
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Input 6
	VMOVDQU        (CX), Y8
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
	VXORPD         Y7, Y8, Y7

	// Store 1 outputs
	VMOVDQU Y7, (R10)
	ADDQ    $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x1Xor_loop
	VZEROUPPER

mulAvxGFNI_7x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x2_64 combines 7 input slices into 2 output slices, 64 bytes per
// iteration. For output j, each 64-byte block of input i is transformed with
// VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix matrix[2*i+j], and the
// 7 results are XORed together. The previous contents of out[0] and out[1]
// are overwritten.
//
// Processing starts at byte offset start in every slice and covers n>>6
// whole 64-byte blocks; an n%64 tail is not processed. When n < 64 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   Z0-Z13                       matrix[0..13], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R11, R10                     write cursors for out[0], out[1]
//   Z14, Z15                     accumulators for out[0], out[1]
//   Z16                          current input block
//   Z17                          scratch (transformed input)
TEXT ·mulGFNI_7x2_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x2_64_end

	// Broadcast the 14 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointers of out[0] and out[1]; R10 is overwritten last because
	// it addresses the header array.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R10
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11
	ADDQ R12, R10

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, CX

mulGFNI_7x2_64_loop:
	// Input 0 writes both accumulators directly (no XOR needed).
	VMOVDQU64      (DX), Z16
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z14
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z15

	// Input 1
	VMOVDQU64      (BX), Z16
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 2
	VMOVDQU64      (SI), Z16
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 3
	VMOVDQU64      (DI), Z16
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 4
	VMOVDQU64      (R8), Z16
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 5
	VMOVDQU64      (R9), Z16
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 6
	VMOVDQU64      (CX), Z16
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Store 2 outputs
	VMOVDQU64 Z14, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z15, (R10)
	ADDQ      $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x2_64_loop
	VZEROUPPER

mulGFNI_7x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x2 combines 7 input slices into 2 output slices, 32 bytes per
// iteration. For output j, each 32-byte block of input i is transformed with
// VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix matrix[2*i+j], and the
// 7 results are XORed together. The previous contents of out[0] and out[1]
// are overwritten.
//
// Only 12 of the 14 matrix entries are kept in registers; the two entries
// for input 6 are re-broadcast from memory on every iteration, so CX must
// keep pointing at the matrix for the whole loop.
//
// Processing starts at byte offset start in every slice and covers n>>5
// whole 32-byte blocks; an n%32 tail is not processed. When n < 32 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base (live throughout)
//   Y0-Y11                        matrix[0..11], broadcast to every qword lane
//   BX, SI, DI, R8, R9, R10, DX   read cursors for in[0..6]
//   R12, R11                      write cursors for out[0], out[1]
//   Y12, Y13                      accumulators for out[0], out[1]
//   Y14                           current input block
//   Y15                           scratch (matrix entry / transformed input)
TEXT ·mulAvxGFNI_7x2(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x2_end

	// Broadcast the first 12 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// DX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Data pointers of out[0] and out[1]; R11 is overwritten last because
	// it addresses the header array.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, DX

mulAvxGFNI_7x2_loop:
	// Input 0 writes both accumulators directly (no XOR needed).
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Input 1
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 2
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 3
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 4
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 5
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 6: matrix[12] and matrix[13] are fetched from memory because
	// no table register is left for them.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (R11)
	ADDQ    $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x2_loop
	VZEROUPPER

mulAvxGFNI_7x2_end:
	RET
|
|
|
|
// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x2_64Xor is the accumulating form of the 7-input, 2-output
// kernel, working on 64 bytes per iteration. The current contents of out[0]
// and out[1] are loaded first; for output j, each 64-byte block of input i
// is transformed with VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix
// matrix[2*i+j] and XORed into the loaded value, which is then stored back.
//
// Processing starts at byte offset start in every slice and covers n>>6
// whole 64-byte blocks; an n%64 tail is not processed. When n < 64 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   Z0-Z13                       matrix[0..13], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R11, R10                     read/write cursors for out[0], out[1]
//   Z14, Z15                     accumulators for out[0], out[1]
//   Z16                          current input block
//   Z17                          scratch (transformed input)
TEXT ·mulGFNI_7x2_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x2_64Xor_end

	// Broadcast the 14 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointers of out[0] and out[1]; R10 is overwritten last because
	// it addresses the header array.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R10
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11
	ADDQ R12, R10

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, CX

mulGFNI_7x2_64Xor_loop:
	// Load 2 outputs: the accumulators start from the existing output data.
	VMOVDQU64 (R11), Z14
	VMOVDQU64 (R10), Z15

	// Input 0
	VMOVDQU64      (DX), Z16
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 1
	VMOVDQU64      (BX), Z16
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 2
	VMOVDQU64      (SI), Z16
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 3
	VMOVDQU64      (DI), Z16
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 4
	VMOVDQU64      (R8), Z16
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 5
	VMOVDQU64      (R9), Z16
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Input 6
	VMOVDQU64      (CX), Z16
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
	VXORPD         Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
	VXORPD         Z15, Z17, Z15

	// Store 2 outputs
	VMOVDQU64 Z14, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z15, (R10)
	ADDQ      $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x2_64Xor_loop
	VZEROUPPER

mulGFNI_7x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x2Xor is the accumulating form of the 7-input, 2-output
// kernel, working on 32 bytes per iteration. The current contents of out[0]
// and out[1] are loaded first; for output j, each 32-byte block of input i
// is transformed with VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix
// matrix[2*i+j] and XORed into the loaded value, which is then stored back.
//
// Only 12 of the 14 matrix entries are kept in registers; the two entries
// for input 6 are re-broadcast from memory on every iteration, so CX must
// keep pointing at the matrix for the whole loop.
//
// Processing starts at byte offset start in every slice and covers n>>5
// whole 32-byte blocks; an n%32 tail is not processed. When n < 32 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base (live throughout)
//   Y0-Y11                        matrix[0..11], broadcast to every qword lane
//   BX, SI, DI, R8, R9, R10, DX   read cursors for in[0..6]
//   R12, R11                      read/write cursors for out[0], out[1]
//   Y12, Y13                      accumulators for out[0], out[1]
//   Y14                           current input block
//   Y15                           scratch (matrix entry / transformed input)
TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x2Xor_end

	// Broadcast the first 12 matrix entries.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// DX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Data pointers of out[0] and out[1]; R11 is overwritten last because
	// it addresses the header array.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, DX

mulAvxGFNI_7x2Xor_loop:
	// Load 2 outputs: the accumulators start from the existing output data.
	VMOVDQU (R12), Y12
	VMOVDQU (R11), Y13

	// Input 0
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 1
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 2
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 3
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 4
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 5
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 6: matrix[12] and matrix[13] are fetched from memory because
	// no table register is left for them.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (R11)
	ADDQ    $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x2Xor_loop
	VZEROUPPER

mulAvxGFNI_7x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x3_64 combines 7 input slices into 3 output slices, 64 bytes per
// iteration. For output j, each 64-byte block of input i is transformed with
// VGF2P8AFFINEQB (imm8 = 0) using the 8x8 bit matrix matrix[3*i+j], and the
// 7 results are XORed together. The previous contents of out[0..2] are
// overwritten.
//
// Processing starts at byte offset start in every slice and covers n>>6
// whole 64-byte blocks; an n%64 tail is not processed. When n < 64 the
// routine returns before any slice data is accessed.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   Z0-Z20                       matrix[0..20], broadcast to every qword lane
//   DX, BX, SI, DI, R8, R9, CX   read cursors for in[0..6]
//   R11, R12, R10                write cursors for out[0], out[1], out[2]
//   Z21, Z22, Z23                accumulators for out[0], out[1], out[2]
//   Z24                          current input block
//   Z25                          scratch (transformed input)
TEXT ·mulGFNI_7x3_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x3_64_end

	// Broadcast the 21 matrix entries.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Data pointer of each in[i]; the slice headers are 24 bytes apart.
	// CX is overwritten last because it addresses the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX

	// Data pointers of out[0..2]; R10 is overwritten last because it
	// addresses the header array.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, CX

mulGFNI_7x3_64_loop:
	// Input 0 writes all three accumulators directly (no XOR needed).
	VMOVDQU64      (DX), Z24
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z23

	// Input 1
	VMOVDQU64      (BX), Z24
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Input 2
	VMOVDQU64      (SI), Z24
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Input 3
	VMOVDQU64      (DI), Z24
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Input 4
	VMOVDQU64      (R8), Z24
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Input 5
	VMOVDQU64      (R9), Z24
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Input 6
	VMOVDQU64      (CX), Z24
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Store 3 outputs
	VMOVDQU64 Z21, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x3_64_loop
	VZEROUPPER

mulGFNI_7x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x3 combines 7 input slices into 3 output slices, 32 bytes per
// loop iteration, overwriting the outputs. For every output o the stored block
// is the XOR over inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]) with
// an immediate of 0.
//
// The loop runs n>>5 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 32-byte block are not processed.
//
// Register roles:
//   AX                           remaining loop iterations (n >> 5)
//   CX                           matrix base; entries 0-10 are cached in Y0-Y10,
//                                entries 11-20 are re-broadcast inside the loop
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6] (data pointer + start)
//   R12, R13, R11                write cursors for out[0]..out[2] (data pointer + start)
//   Y11-Y13                      per-output accumulators
//   Y14                          current input block
//   Y15                          scratch (broadcast matrix entry / product)
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulAvxGFNI_7x3(SB), $0-88
	// Loading 11 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX
	SHRQ         $0x05, AX
	TESTQ        AX, AX
	JZ           mulAvxGFNI_7x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// DX and R11 are reused for the last pointer once the header array is consumed.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), DX
	MOVQ         out_base+48(FP), R11
	MOVQ         (R11), R12
	MOVQ         24(R11), R13
	MOVQ         48(R11), R11
	MOVQ         start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX

mulAvxGFNI_7x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first input writes the accumulators directly, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from matrix entry 11 onwards the table is broadcast from memory into Y15)
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (R11)
	ADDQ    $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x3_loop
	VZEROUPPER

mulAvxGFNI_7x3_end:
	RET
|
|
|
|
// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x3_64Xor combines 7 input slices into 3 output slices, 64 bytes per
// loop iteration, XORing the result into what the outputs already hold. For
// every output o the stored block is the previous output block XOR the XOR over
// inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]) with an immediate of 0.
//
// The loop runs n>>6 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 64-byte block are not processed.
//
// Register roles:
//   AX                          remaining loop iterations (n >> 6)
//   CX                          matrix base during setup, then the in[6] read cursor
//                               (all 21 matrix entries are cached in Z0-Z20)
//   DX, BX, SI, DI, R8, R9, CX  read cursors for in[0]..in[6] (data pointer + start)
//   R11, R12, R10               read/write cursors for out[0]..out[2]
//   Z21-Z23                     per-output accumulators
//   Z24                         current input block
//   Z25                         scratch product
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulGFNI_7x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x06, AX
	TESTQ           AX, AX
	JZ              mulGFNI_7x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// CX and R10 are reused for the last pointer once the header array is consumed.
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), DI
	MOVQ            96(CX), R8
	MOVQ            120(CX), R9
	MOVQ            144(CX), CX
	MOVQ            out_base+48(FP), R10
	MOVQ            (R10), R11
	MOVQ            24(R10), R12
	MOVQ            48(R10), R10
	MOVQ            start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, CX

mulGFNI_7x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R11), Z21
	VMOVDQU64 (R12), Z22
	VMOVDQU64 (R10), Z23

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z24
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z24
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z24
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (DI), Z24
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64      (R8), Z24
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64      (R9), Z24
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64      (CX), Z24
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD         Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD         Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
	VXORPD         Z23, Z25, Z23

	// Store 3 outputs
	VMOVDQU64 Z21, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z23, (R10)
	ADDQ      $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x3_64Xor_loop
	VZEROUPPER

mulGFNI_7x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x3Xor combines 7 input slices into 3 output slices, 32 bytes per
// loop iteration, XORing the result into what the outputs already hold. For
// every output o the stored block is the previous output block XOR the XOR over
// inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]) with an immediate of 0.
//
// The loop runs n>>5 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 32-byte block are not processed.
//
// Register roles:
//   AX                           remaining loop iterations (n >> 5)
//   CX                           matrix base; entries 0-10 are cached in Y0-Y10,
//                                entries 11-20 are re-broadcast inside the loop
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6] (data pointer + start)
//   R12, R13, R11                read/write cursors for out[0]..out[2]
//   Y11-Y13                      per-output accumulators
//   Y14                          current input block
//   Y15                          scratch (broadcast matrix entry / product)
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88
	// Loading 11 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX
	SHRQ         $0x05, AX
	TESTQ        AX, AX
	JZ           mulAvxGFNI_7x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// DX and R11 are reused for the last pointer once the header array is consumed.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), DX
	MOVQ         out_base+48(FP), R11
	MOVQ         (R11), R12
	MOVQ         24(R11), R13
	MOVQ         48(R11), R11
	MOVQ         start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX

mulAvxGFNI_7x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from matrix entry 11 onwards the table is broadcast from memory into Y15)
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (R11)
	ADDQ    $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x3Xor_loop
	VZEROUPPER

mulAvxGFNI_7x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x4_64 combines 7 input slices into 4 output slices, 64 bytes per
// loop iteration, overwriting the outputs. For every output o the stored block
// is the XOR over inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]) with
// an immediate of 0.
//
// The loop runs n>>6 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 64-byte block are not processed.
//
// Register roles:
//   AX                           remaining loop iterations (n >> 6)
//   CX                           matrix base; entries 0-25 are cached in Z0-Z25,
//                                entries 26 and 27 are used as broadcast memory
//                                operands (208(CX), 216(CX)) inside the loop
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6] (data pointer + start)
//   R12, R13, R14, R11           write cursors for out[0]..out[3] (data pointer + start)
//   Z26-Z29                      per-output accumulators
//   Z30                          current input block
//   Z31                          scratch product
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulGFNI_7x4_64(SB), $0-88
	// Loading 26 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x06, AX
	TESTQ           AX, AX
	JZ              mulGFNI_7x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// DX and R11 are reused for the last pointer once the header array is consumed.
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), R8
	MOVQ            96(DX), R9
	MOVQ            120(DX), R10
	MOVQ            144(DX), DX
	MOVQ            out_base+48(FP), R11
	MOVQ            (R11), R12
	MOVQ            24(R11), R13
	MOVQ            48(R11), R14
	MOVQ            72(R11), R11
	MOVQ            start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulGFNI_7x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, so no XOR is needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64      (R10), Z30
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	// (the last two matrix entries are not cached; they are broadcast from memory)
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB      $0x00, Z24, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z25, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 4 outputs
	VMOVDQU64 Z26, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z29, (R11)
	ADDQ      $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x4_64_loop
	VZEROUPPER

mulGFNI_7x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x4 combines 7 input slices into 4 output slices, 32 bytes per
// loop iteration, overwriting the outputs. For every output o the stored block
// is the XOR over inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]) with
// an immediate of 0.
//
// The loop runs n>>5 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 32-byte block are not processed.
//
// Register roles:
//   AX                           remaining loop iterations (n >> 5)
//   CX                           matrix base; entries 0-9 are cached in Y0-Y9,
//                                entries 10-27 are re-broadcast inside the loop
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6] (data pointer + start)
//   R12, R13, R14, R11           write cursors for out[0]..out[3] (data pointer + start)
//   Y10-Y13                      per-output accumulators
//   Y14                          current input block
//   Y15                          scratch (broadcast matrix entry / product)
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulAvxGFNI_7x4(SB), $0-88
	// Loading 10 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX
	SHRQ         $0x05, AX
	TESTQ        AX, AX
	JZ           mulAvxGFNI_7x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// DX and R11 are reused for the last pointer once the header array is consumed.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), DX
	MOVQ         out_base+48(FP), R11
	MOVQ         (R11), R12
	MOVQ         24(R11), R13
	MOVQ         48(R11), R14
	MOVQ         72(R11), R11
	MOVQ         start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulAvxGFNI_7x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from matrix entry 10 onwards the table is broadcast from memory into Y15)
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (R11)
	ADDQ    $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x4_loop
	VZEROUPPER

mulAvxGFNI_7x4_end:
	RET
|
|
|
|
// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x4_64Xor combines 7 input slices into 4 output slices, 64 bytes per
// loop iteration, XORing the result into what the outputs already hold. For
// every output o the stored block is the previous output block XOR the XOR over
// inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]) with an immediate of 0.
//
// The loop runs n>>6 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 64-byte block are not processed.
//
// Register roles:
//   AX                           remaining loop iterations (n >> 6)
//   CX                           matrix base; entries 0-25 are cached in Z0-Z25,
//                                entries 26 and 27 are used as broadcast memory
//                                operands (208(CX), 216(CX)) inside the loop
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6] (data pointer + start)
//   R12, R13, R14, R11           read/write cursors for out[0]..out[3]
//   Z26-Z29                      per-output accumulators
//   Z30                          current input block
//   Z31                          scratch product
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulGFNI_7x4_64Xor(SB), $0-88
	// Loading 26 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x06, AX
	TESTQ           AX, AX
	JZ              mulGFNI_7x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// DX and R11 are reused for the last pointer once the header array is consumed.
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), R8
	MOVQ            96(DX), R9
	MOVQ            120(DX), R10
	MOVQ            144(DX), DX
	MOVQ            out_base+48(FP), R11
	MOVQ            (R11), R12
	MOVQ            24(R11), R13
	MOVQ            48(R11), R14
	MOVQ            72(R11), R11
	MOVQ            start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulGFNI_7x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R11), Z29

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64      (R10), Z30
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	// (the last two matrix entries are not cached; they are broadcast from memory)
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB      $0x00, Z24, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z25, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 4 outputs
	VMOVDQU64 Z26, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z29, (R11)
	ADDQ      $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_7x4_64Xor_loop
	VZEROUPPER

mulGFNI_7x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x4Xor combines 7 input slices into 4 output slices, 32 bytes per
// loop iteration, XORing the result into what the outputs already hold. For
// every output o the stored block is the previous output block XOR the XOR over
// inputs i of VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]) with an immediate of 0.
//
// The loop runs n>>5 times; if that is zero the routine returns without
// touching memory. Bytes beyond the last full 32-byte block are not processed.
//
// Register roles:
//   AX                           remaining loop iterations (n >> 5)
//   CX                           matrix base; entries 0-9 are cached in Y0-Y9,
//                                entries 10-27 are re-broadcast inside the loop
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6] (data pointer + start)
//   R12, R13, R14, R11           read/write cursors for out[0]..out[3]
//   Y10-Y13                      per-output accumulators
//   Y14                          current input block
//   Y15                          scratch (broadcast matrix entry / product)
//
// NOTE: the file header marks this file as generated by gen.go. The duplicated
// load of out_base that the generator emitted has been dropped here; the same
// change belongs in the generator.
TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88
	// Loading 10 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX
	SHRQ         $0x05, AX
	TESTQ        AX, AX
	JZ           mulAvxGFNI_7x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers in in/out are 24 bytes apart; the first word is the data pointer.
	// DX and R11 are reused for the last pointer once the header array is consumed.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), DX
	MOVQ         out_base+48(FP), R11
	MOVQ         (R11), R12
	MOVQ         24(R11), R13
	MOVQ         48(R11), R14
	MOVQ         72(R11), R11
	MOVQ         start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulAvxGFNI_7x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from matrix entry 10 onwards the table is broadcast from memory into Y15)
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (R11)
	ADDQ    $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_7x4Xor_loop
	VZEROUPPER

mulAvxGFNI_7x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x5_64 combines 7 input slices into 5 output slices, 64 bytes per
// slice per loop iteration, overwriting the outputs.
//
//   matrix  35 uint64 entries, each used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB. The entry for (input i, output j) is read from
//           byte offset (i*5+j)*8.
//   in      slice headers 0..6 are read (data pointers 24 bytes apart).
//   out     slice headers 0..4 are read (data pointers 24 bytes apart).
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 blocks of 64 bytes are processed and any
//           remainder is not touched. Returns at once when n>>6 is zero.
//
// Register use:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX SI DI R8 R9 R10 DX   input cursors 0..6
//   R12 R13 R14 R15 R11     output cursors 0..4
//   BP                      start offset (setup only)
//   Z0-Z24                  tables for inputs 0..4 (inputs 5 and 6 are
//                           broadcast from memory inside the loop)
//   Z25-Z29                 output accumulators 0..4
//   Z30 / Z31               current input block / scratch product
TEXT ·mulGFNI_7x5_64(SB), $8-88
	// Loading 25 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch
	// can consume the flags directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x5_64_end

	// Tables for inputs 0..4, one 8-byte matrix broadcast per register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Input data pointers; DX is overwritten last because it holds the base
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output data pointers; R11 is overwritten last because it holds the base
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX

mulGFNI_7x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (tables broadcast straight from memory)
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x5_64_loop
	VZEROUPPER

mulGFNI_7x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x5 combines 7 input slices into 5 output slices, 32 bytes per
// slice per loop iteration, overwriting the outputs.
//
//   matrix  35 uint64 entries, each used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB. The entry for (input i, output j) is read from
//           byte offset (i*5+j)*8.
//   in      slice headers 0..6 are read (data pointers 24 bytes apart).
//   out     slice headers 0..4 are read (data pointers 24 bytes apart).
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 blocks of 32 bytes are processed and any
//           remainder is not touched. Returns at once when n>>5 is zero.
//
// Register use:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX SI DI R8 R9 R10 DX   input cursors 0..6
//   R12 R13 R14 R15 R11     output cursors 0..4
//   BP                      start offset (setup only)
//   Y0-Y8                   first 9 tables; the rest are broadcast into Y15
//                           from memory inside the loop
//   Y9-Y13                  output accumulators 0..4
//   Y14 / Y15               current input block / table and scratch product
TEXT ·mulAvxGFNI_7x5(SB), $8-88
	// Loading 9 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch
	// can consume the flags directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x5_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Input data pointers; DX is overwritten last because it holds the base
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output data pointers; R11 is overwritten last because it holds the base
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX

mulAvxGFNI_7x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x5_loop
	VZEROUPPER

mulAvxGFNI_7x5_end:
	RET
|
|
|
|
// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x5_64Xor combines 7 input slices and XORs the result into the
// existing contents of 5 output slices, 64 bytes per slice per loop iteration.
//
//   matrix  35 uint64 entries, each used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB. The entry for (input i, output j) is read from
//           byte offset (i*5+j)*8.
//   in      slice headers 0..6 are read (data pointers 24 bytes apart).
//   out     slice headers 0..4 are read (data pointers 24 bytes apart);
//           the current output bytes seed the accumulators.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 blocks of 64 bytes are processed and any
//           remainder is not touched. Returns at once when n>>6 is zero.
//
// Register use:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX SI DI R8 R9 R10 DX   input cursors 0..6
//   R12 R13 R14 R15 R11     output cursors 0..4
//   BP                      start offset (setup only)
//   Z0-Z24                  tables for inputs 0..4 (inputs 5 and 6 are
//                           broadcast from memory inside the loop)
//   Z25-Z29                 output accumulators 0..4
//   Z30 / Z31               current input block / scratch product
TEXT ·mulGFNI_7x5_64Xor(SB), $8-88
	// Loading 25 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch
	// can consume the flags directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x5_64Xor_end

	// Tables for inputs 0..4, one 8-byte matrix broadcast per register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Input data pointers; DX is overwritten last because it holds the base
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output data pointers; R11 is overwritten last because it holds the base
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX

mulGFNI_7x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R11), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (tables broadcast straight from memory)
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x5_64Xor_loop
	VZEROUPPER

mulGFNI_7x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x5Xor combines 7 input slices and XORs the result into the
// existing contents of 5 output slices, 32 bytes per slice per loop iteration.
//
//   matrix  35 uint64 entries, each used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB. The entry for (input i, output j) is read from
//           byte offset (i*5+j)*8.
//   in      slice headers 0..6 are read (data pointers 24 bytes apart).
//   out     slice headers 0..4 are read (data pointers 24 bytes apart);
//           the current output bytes seed the accumulators.
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 blocks of 32 bytes are processed and any
//           remainder is not touched. Returns at once when n>>5 is zero.
//
// Register use:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX SI DI R8 R9 R10 DX   input cursors 0..6
//   R12 R13 R14 R15 R11     output cursors 0..4
//   BP                      start offset (setup only)
//   Y0-Y8                   first 9 tables; the rest are broadcast into Y15
//                           from memory inside the loop
//   Y9-Y13                  output accumulators 0..4
//   Y14 / Y15               current input block / table and scratch product
TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88
	// Loading 9 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch
	// can consume the flags directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x5Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Input data pointers; DX is overwritten last because it holds the base
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output data pointers; R11 is overwritten last because it holds the base
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX

mulAvxGFNI_7x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x5Xor_loop
	VZEROUPPER

mulAvxGFNI_7x5Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x6_64 combines 7 input slices into 6 output slices, 64 bytes per
// slice per loop iteration, overwriting the outputs.
//
//   matrix  42 uint64 entries, each used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB. The entry for (input i, output j) is read from
//           byte offset (i*6+j)*8.
//   in      slice headers 0..6 are read (data pointers 24 bytes apart).
//   out     slice headers 0..5 are read (data pointers 24 bytes apart).
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>6 blocks of 64 bytes are processed and any
//           remainder is not touched. Returns at once when n>>6 is zero.
//
// Register use:
//   AX                      block count for the early-out test only, then
//                           reused as the input 6 cursor
//   CX                      matrix base
//   DX BX SI DI R8 R9 AX    input cursors 0..6
//   R11 R12 R13 R14 R15 R10 output cursors 0..5
//   BP                      start offset during setup, then the loop counter
//                           (n>>6 recomputed from the argument)
//   Z0-Z23                  tables for inputs 0..3 (inputs 4..6 are
//                           broadcast from memory inside the loop)
//   Z24-Z29                 output accumulators 0..5
//   Z30 / Z31               current input block / scratch product
TEXT ·mulGFNI_7x6_64(SB), $8-88
	// Loading 24 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch
	// can consume the flags directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x6_64_end

	// Tables for inputs 0..3, one 8-byte matrix broadcast per register
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Input data pointers; AX is overwritten last because it holds the base
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX

	// Output data pointers; R10 is overwritten last because it holds the base
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_7x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (tables broadcast straight from memory from here on)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_7x6_64_loop
	VZEROUPPER

mulGFNI_7x6_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x6 combines 7 input slices into 6 output slices, 32 bytes per
// slice per loop iteration, overwriting the outputs.
//
//   matrix  42 uint64 entries, each used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB. The entry for (input i, output j) is read from
//           byte offset (i*6+j)*8.
//   in      slice headers 0..6 are read (data pointers 24 bytes apart).
//   out     slice headers 0..5 are read (data pointers 24 bytes apart).
//   start   byte offset added to every input and output pointer.
//   n       byte count; n>>5 blocks of 32 bytes are processed and any
//           remainder is not touched. Returns at once when n>>5 is zero.
//
// Register use:
//   AX                      block count for the early-out test only, then
//                           reused as the input 6 cursor
//   CX                      matrix base
//   DX BX SI DI R8 R9 AX    input cursors 0..6
//   R11 R12 R13 R14 R15 R10 output cursors 0..5
//   BP                      start offset during setup, then the loop counter
//                           (n>>5 recomputed from the argument)
//   Y0-Y7                   first 8 tables; the rest are broadcast into Y15
//                           from memory inside the loop
//   Y8-Y13                  output accumulators 0..5
//   Y14 / Y15               current input block / table and scratch product
TEXT ·mulAvxGFNI_7x6(SB), $8-88
	// Loading 8 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch
	// can consume the flags directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x6_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Input data pointers; AX is overwritten last because it holds the base
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX

	// Output data pointers; R10 is overwritten last because it holds the base
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_7x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_7x6_loop
	VZEROUPPER

mulAvxGFNI_7x6_end:
	RET
|
|
|
|
// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the 6 outputs, XORs into the existing output bytes the
// VGF2P8AFFINEQB transform of each of the 7 inputs, 64 bytes per iteration.
// matrix holds one 8-byte entry per (input, output) pair at byte offset
// 8*(input*6+output). Processing begins at byte offset start in every slice
// and covers (n >> 6) * 64 bytes; if n < 64 nothing is read or written.
//
// Register roles:
//   CX                         matrix base
//   Z0-Z23                     broadcast matrix entries for inputs 0-3
//   DX, BX, SI, DI, R8, R9, AX input pointers 0-6
//   R11-R15, R10               output pointers 0-5
//   BP                         remaining 64-byte iterations
//   Z24-Z29                    output accumulators, Z30 input, Z31 scratch
// NOTE(review): BP is used as a plain counter; the $8 frame is presumably
// what makes the assembler preserve the caller's BP - confirm.
TEXT ·mulGFNI_7x6_64Xor(SB), $8-88
	// Loading 24 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so no TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_7x6_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Fetch the data pointer of each input slice header (24 bytes apart).
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX

	// Fetch the data pointer of each output slice header.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_7x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R10), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (matrix entries from here on are broadcast straight from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_7x6_64Xor_loop
	VZEROUPPER

mulGFNI_7x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the 6 outputs, XORs into the existing output bytes the
// VGF2P8AFFINEQB transform of each of the 7 inputs, 32 bytes per iteration.
// matrix holds one 8-byte entry per (input, output) pair at byte offset
// 8*(input*6+output). Processing begins at byte offset start in every slice
// and covers (n >> 5) * 32 bytes; if n < 32 nothing is read or written.
//
// Register roles:
//   CX                         matrix base
//   Y0-Y7                      broadcast matrix entries 0-7 (input 0, and the
//                              first two entries of input 1)
//   DX, BX, SI, DI, R8, R9, AX input pointers 0-6
//   R11-R15, R10               output pointers 0-5
//   BP                         remaining 32-byte iterations
//   Y8-Y13                     output accumulators, Y14 input, Y15 scratch
// NOTE(review): BP is used as a plain counter; the $8 frame is presumably
// what makes the assembler preserve the caller's BP - confirm.
TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88
	// Loading 8 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so no TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x6Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Fetch the data pointer of each input slice header (24 bytes apart).
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX

	// Fetch the data pointer of each output slice header.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_7x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R10), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	// (entries past Y7 are broadcast from memory into the scratch register)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_7x6Xor_loop
	VZEROUPPER

mulAvxGFNI_7x6Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites each of the 7 outputs with the XOR of the VGF2P8AFFINEQB
// transforms of the 7 inputs, 64 bytes per iteration. matrix holds one 8-byte
// entry per (input, output) pair at byte offset 8*(input*7+output).
// Processing begins at byte offset start in every slice and covers
// (n >> 6) * 64 bytes; if n < 64 nothing is read or written.
//
// Register roles:
//   AX                           remaining 64-byte iterations
//   CX                           matrix base
//   Z0-Z22                       broadcast matrix entries 0-22
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0-6
//   R11                          base of the out slice-header array
//   R12                          running byte offset into every output
//   R13                          scratch: current output data pointer
//   Z23-Z29                      output accumulators, Z30 input, Z31 scratch
TEXT ·mulGFNI_7x7_64(SB), $0-88
	// Loading 23 of 49 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so no TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_7x7_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice header (24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output pointers are re-read from the slice headers at R11 every iteration.
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (entries past Z22 are broadcast straight from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x7_64_loop
	VZEROUPPER

mulGFNI_7x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites each of the 7 outputs with the XOR of the VGF2P8AFFINEQB
// transforms of the 7 inputs, 32 bytes per iteration. matrix holds one 8-byte
// entry per (input, output) pair at byte offset 8*(input*7+output).
// Processing begins at byte offset start in every slice and covers
// (n >> 5) * 32 bytes; if n < 32 nothing is read or written.
//
// Register roles:
//   AX                           remaining 32-byte iterations
//   CX                           matrix base
//   Y0-Y6                        broadcast matrix entries for input 0
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0-6
//   R11                          base of the out slice-header array
//   R12                          running byte offset into every output
//   R13                          scratch: current output data pointer
//   Y7-Y13                       output accumulators, Y14 input, Y15 scratch
TEXT ·mulAvxGFNI_7x7(SB), $0-88
	// Loading 7 of 49 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so no TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice header (24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output pointers are re-read from the slice headers at R11 every iteration.
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// (entries past Y6 are broadcast from memory into the scratch register)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	MOVQ (R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x7_loop
	VZEROUPPER

mulAvxGFNI_7x7_end:
	RET
|
|
|
|
// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the 7 outputs, XORs into the existing output bytes the
// VGF2P8AFFINEQB transform of each of the 7 inputs, 64 bytes per iteration.
// matrix holds one 8-byte entry per (input, output) pair at byte offset
// 8*(input*7+output). Processing begins at byte offset start in every slice
// and covers (n >> 6) * 64 bytes; if n < 64 nothing is read or written.
//
// Register roles:
//   AX                           remaining 64-byte iterations
//   CX                           matrix base
//   Z0-Z22                       broadcast matrix entries 0-22
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0-6
//   R11                          base of the out slice-header array
//   R12                          running byte offset into every output
//   R13                          scratch: current output data pointer
//   Z23-Z29                      output accumulators, Z30 input, Z31 scratch
TEXT ·mulGFNI_7x7_64Xor(SB), $0-88
	// Loading 23 of 49 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so no TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_7x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice header (24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output pointers are re-read from the slice headers at R11 every iteration.
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x7_64Xor_loop:
	// Load 7 outputs
	MOVQ (R11), R13
	VMOVDQU64 (R13)(R12*1), Z23
	MOVQ 24(R11), R13
	VMOVDQU64 (R13)(R12*1), Z24
	MOVQ 48(R11), R13
	VMOVDQU64 (R13)(R12*1), Z25
	MOVQ 72(R11), R13
	VMOVDQU64 (R13)(R12*1), Z26
	MOVQ 96(R11), R13
	VMOVDQU64 (R13)(R12*1), Z27
	MOVQ 120(R11), R13
	VMOVDQU64 (R13)(R12*1), Z28
	MOVQ 144(R11), R13
	VMOVDQU64 (R13)(R12*1), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (entries past Z22 are broadcast straight from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x7_64Xor_loop
	VZEROUPPER

mulGFNI_7x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 (integer division) blocks of 32 bytes. For every block the
// current contents of the 7 output slices are loaded, the GF(2^8) affine
// transform (VGF2P8AFFINEQB, constant 0) of the block from each of the 7
// inputs is XORed into them, and the results are written back. If n is less
// than 32 the function returns without touching memory.
//
// Register use:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base; entry for (input i, output o) is at 8*(i*7+o)
//   BX, SI, DI, R8, R9, R10, DX   read pointers for in[0]..in[6], pre-advanced by start
//   R11                           base of the out slice headers (24 bytes apart)
//   R12                           byte offset into every output, initialised to start
//   R13                           scratch: data pointer of the output being loaded/stored
//   Y0-Y6                         matrix entries 0..6 (input 0), broadcast once
//   Y7-Y13                        accumulators for outputs 0..6
//   Y14                           current input block
//   Y15                           scratch: broadcast matrix entry, then the affine product
TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88
	// Loading 7 of 49 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from the result, so the block
	// count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x7Xor_loop:
	// Load 7 outputs
	MOVQ    (R11), R13
	VMOVDQU (R13)(R12*1), Y7
	MOVQ    24(R11), R13
	VMOVDQU (R13)(R12*1), Y8
	MOVQ    48(R11), R13
	VMOVDQU (R13)(R12*1), Y9
	MOVQ    72(R11), R13
	VMOVDQU (R13)(R12*1), Y10
	MOVQ    96(R11), R13
	VMOVDQU (R13)(R12*1), Y11
	MOVQ    120(R11), R13
	VMOVDQU (R13)(R12*1), Y12
	MOVQ    144(R11), R13
	VMOVDQU (R13)(R12*1), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ    $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ    $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ    $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ    $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ    $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ    $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ    $0x20, DX
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Store 7 outputs
	MOVQ    (R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ    24(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ    48(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ    72(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ    96(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ    120(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ    144(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ  mulAvxGFNI_7x7Xor_loop

	// Only reached after the loop has run; the early exit never wrote a
	// vector register.
	VZEROUPPER

mulAvxGFNI_7x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 (integer division) blocks of 64 bytes. For every block each
// of the 8 outputs is computed as the XOR, over the 7 inputs, of the GF(2^8)
// affine transform (VGF2P8AFFINEQB, constant 0) of that input's block, and
// the result overwrites the output at the running offset. If n is less than
// 64 the function returns without touching memory.
//
// Register use:
//   AX                            remaining 64-byte blocks
//   CX                            matrix base; entry for (input i, output o) is at 8*(i*8+o)
//   BX, SI, DI, R8, R9, R10, DX   read pointers for in[0]..in[6], pre-advanced by start
//   R11                           base of the out slice headers (24 bytes apart)
//   R12                           byte offset into every output, initialised to start
//   R13                           scratch: data pointer of the output being stored
//   Z0-Z21                        matrix entries 0..21, broadcast once; entries 22..55
//                                 are read from memory with an embedded broadcast
//   Z22-Z29                       accumulators for outputs 0..7
//   Z30                           current input block
//   Z31                           scratch: affine product before it is XORed in
TEXT ·mulGFNI_7x8_64(SB), $0-88
	// Loading 22 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 66 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from the result, so the block
	// count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ      $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ      $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ      $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ      $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ      $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ      $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ      $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Store 8 outputs
	MOVQ      (R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ      24(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ      48(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ      72(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ      96(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ      120(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ      144(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ      168(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ  mulGFNI_7x8_64_loop

	// Only reached after the loop has run; the early exit never wrote a
	// vector register.
	VZEROUPPER

mulGFNI_7x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n/32 (integer division) blocks of 32 bytes. For every block each
// of the 8 outputs is computed as the XOR, over the 7 inputs, of the GF(2^8)
// affine transform (VGF2P8AFFINEQB, constant 0) of that input's block, and
// the result overwrites the output at the running offset. If n is less than
// 32 the function returns without touching memory.
//
// Register use:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base; entry for (input i, output o) is at 8*(i*8+o)
//   BX, SI, DI, R8, R9, R10, DX   read pointers for in[0]..in[6], pre-advanced by start
//   R11                           base of the out slice headers (24 bytes apart)
//   R12                           byte offset into every output, initialised to start
//   R13                           scratch: data pointer of the output being stored
//   Y0-Y5                         matrix entries 0..5, broadcast once; the remaining
//                                 entries are re-broadcast from memory on every use
//   Y6-Y13                        accumulators for outputs 0..7
//   Y14                           current input block
//   Y15                           scratch: broadcast matrix entry, then the affine product
TEXT ·mulAvxGFNI_7x8(SB), $0-88
	// Loading 6 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 66 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from the result, so the block
	// count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, so no XOR is needed;
	// entries 6 and 7 are broadcast straight into their accumulators)
	VMOVDQU (BX), Y14
	ADDQ    $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ    $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ    $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (R8), Y14
	ADDQ    $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (R9), Y14
	ADDQ    $0x20, R9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU (R10), Y14
	ADDQ    $0x20, R10
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y6, Y15, Y6
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ    $0x20, DX
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y6, Y15, Y6
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD  Y13, Y15, Y13

	// Store 8 outputs
	MOVQ    (R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ    24(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ    48(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ    72(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ    96(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ    120(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ    144(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ    168(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ  mulAvxGFNI_7x8_loop

	// Only reached after the loop has run; the early exit never wrote a
	// vector register.
	VZEROUPPER

mulAvxGFNI_7x8_end:
	RET
|
|
|
|
// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n/64 (integer division) blocks of 64 bytes. For every block the
// current contents of the 8 output slices are loaded, the GF(2^8) affine
// transform (VGF2P8AFFINEQB, constant 0) of the block from each of the 7
// inputs is XORed into them, and the results are written back. If n is less
// than 64 the function returns without touching memory.
//
// Register use:
//   AX                            remaining 64-byte blocks
//   CX                            matrix base; entry for (input i, output o) is at 8*(i*8+o)
//   BX, SI, DI, R8, R9, R10, DX   read pointers for in[0]..in[6], pre-advanced by start
//   R11                           base of the out slice headers (24 bytes apart)
//   R12                           byte offset into every output, initialised to start
//   R13                           scratch: data pointer of the output being loaded/stored
//   Z0-Z21                        matrix entries 0..21, broadcast once; entries 22..55
//                                 are read from memory with an embedded broadcast
//   Z22-Z29                       accumulators for outputs 0..7
//   Z30                           current input block
//   Z31                           scratch: affine product before it is XORed in
TEXT ·mulGFNI_7x8_64Xor(SB), $0-88
	// Loading 22 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 66 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from the result, so the block
	// count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_7x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x8_64Xor_loop:
	// Load 8 outputs
	MOVQ      (R11), R13
	VMOVDQU64 (R13)(R12*1), Z22
	MOVQ      24(R11), R13
	VMOVDQU64 (R13)(R12*1), Z23
	MOVQ      48(R11), R13
	VMOVDQU64 (R13)(R12*1), Z24
	MOVQ      72(R11), R13
	VMOVDQU64 (R13)(R12*1), Z25
	MOVQ      96(R11), R13
	VMOVDQU64 (R13)(R12*1), Z26
	MOVQ      120(R11), R13
	VMOVDQU64 (R13)(R12*1), Z27
	MOVQ      144(R11), R13
	VMOVDQU64 (R13)(R12*1), Z28
	MOVQ      168(R11), R13
	VMOVDQU64 (R13)(R12*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ      $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ      $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ      $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ      $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ      $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ      $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ      $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD    Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD    Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD    Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD    Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD    Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD    Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD    Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD    Z29, Z31, Z29

	// Store 8 outputs
	MOVQ      (R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ      24(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ      48(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ      72(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ      96(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ      120(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ      144(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ      168(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ  mulGFNI_7x8_64Xor_loop

	// Only reached after the loop has run; the early exit never wrote a
	// vector register.
	VZEROUPPER

mulGFNI_7x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88
	// 7 inputs, 8 outputs, 32 bytes per loop iteration. The existing
	// contents of each output are loaded, the transformed inputs are XORed
	// in, and the result is stored back.
	//
	// Register roles:
	//   AX      = n >> 5, number of 32-byte blocks left
	//   CX      = matrix base; entry (input i, output j) is the 8 bytes at (i*8+j)*8
	//   Y0-Y5   = matrix entries 0-5, broadcast once; entries 6-55 are broadcast on use
	//   BX, SI, DI, R8, R9, R10, DX = read cursors for in[0]..in[6], offset by start
	//   R11     = base of the out slice headers (24 bytes apart)
	//   R12     = byte offset applied to every output, initially start
	//   R13     = scratch for the current output data pointer
	//   Y6-Y13  = accumulators for out[0]..out[7]
	//   Y14     = current input block, Y15 = scratch
	// Bytes past the last full 32-byte block are not processed here.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x8Xor_loop:
	// Load 8 outputs
	MOVQ (R11), R13
	VMOVDQU (R13)(R12*1), Y6
	MOVQ 24(R11), R13
	VMOVDQU (R13)(R12*1), Y7
	MOVQ 48(R11), R13
	VMOVDQU (R13)(R12*1), Y8
	MOVQ 72(R11), R13
	VMOVDQU (R13)(R12*1), Y9
	MOVQ 96(R11), R13
	VMOVDQU (R13)(R12*1), Y10
	MOVQ 120(R11), R13
	VMOVDQU (R13)(R12*1), Y11
	MOVQ 144(R11), R13
	VMOVDQU (R13)(R12*1), Y12
	MOVQ 168(R11), R13
	VMOVDQU (R13)(R12*1), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	MOVQ (R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x8Xor_loop
	VZEROUPPER

mulAvxGFNI_7x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x9_64(SB), $0-88
	// 7 inputs, 9 outputs, 64 bytes per loop iteration. Each output block
	// is computed from the inputs alone and overwrites the destination.
	//
	// Register roles:
	//   AX       = n >> 6, number of 64-byte blocks left
	//   CX       = matrix base; entry (input i, output j) is the 8 bytes at (i*9+j)*8
	//   Z0-Z20   = matrix entries 0-20, broadcast once; entries 21-62 are used as
	//              embedded-broadcast memory operands (.BCST)
	//   BX, SI, DI, R8, R9, R10, DX = read cursors for in[0]..in[6], offset by start
	//   R11      = base of the out slice headers (24 bytes apart)
	//   R12      = byte offset applied to every output, initially start
	//   R13      = scratch for the current output data pointer
	//   Z21-Z29  = accumulators for out[0]..out[8]
	//   Z30      = current input block, Z31 = scratch
	// Bytes past the last full 64-byte block are not processed here.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z21, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x9_64_loop
	VZEROUPPER

mulGFNI_7x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x9(SB), $0-88
	// 7 inputs, 9 outputs, 32 bytes per loop iteration. Each output block
	// is computed from the inputs alone and overwrites the destination.
	//
	// Register roles:
	//   AX      = n >> 5, number of 32-byte blocks left
	//   CX      = matrix base; entry (input i, output j) is the 8 bytes at (i*9+j)*8
	//   Y0-Y4   = matrix entries 0-4, broadcast once; entries 5-62 are broadcast on use
	//   BX, SI, DI, R8, R9, R10, DX = read cursors for in[0]..in[6], offset by start
	//   R11     = base of the out slice headers (24 bytes apart)
	//   R12     = byte offset applied to every output, initially start
	//   R13     = scratch for the current output data pointer
	//   Y5-Y13  = accumulators for out[0]..out[8]
	//   Y14     = current input block, Y15 = scratch
	// Bytes past the last full 32-byte block are not processed here.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (first input initialises the accumulators, so no XOR is needed;
	// for Y10-Y13 the matrix entry is broadcast straight into the accumulator)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R11), R13
	VMOVDQU Y5, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x9_loop
	VZEROUPPER

mulAvxGFNI_7x9_end:
	RET
|
|
|
|
// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x9_64Xor(SB), $0-88
	// 7 inputs, 9 outputs, 64 bytes per loop iteration. The existing
	// contents of each output are loaded, the transformed inputs are XORed
	// in, and the result is stored back.
	//
	// Register roles:
	//   AX       = n >> 6, number of 64-byte blocks left
	//   CX       = matrix base; entry (input i, output j) is the 8 bytes at (i*9+j)*8
	//   Z0-Z20   = matrix entries 0-20, broadcast once; entries 21-62 are used as
	//              embedded-broadcast memory operands (.BCST)
	//   BX, SI, DI, R8, R9, R10, DX = read cursors for in[0]..in[6], offset by start
	//   R11      = base of the out slice headers (24 bytes apart)
	//   R12      = byte offset applied to every output, initially start
	//   R13      = scratch for the current output data pointer
	//   Z21-Z29  = accumulators for out[0]..out[8]
	//   Z30      = current input block, Z31 = scratch
	// Bytes past the last full 64-byte block are not processed here.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x9_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x9_64Xor_loop:
	// Load 9 outputs
	MOVQ (R11), R13
	VMOVDQU64 (R13)(R12*1), Z21
	MOVQ 24(R11), R13
	VMOVDQU64 (R13)(R12*1), Z22
	MOVQ 48(R11), R13
	VMOVDQU64 (R13)(R12*1), Z23
	MOVQ 72(R11), R13
	VMOVDQU64 (R13)(R12*1), Z24
	MOVQ 96(R11), R13
	VMOVDQU64 (R13)(R12*1), Z25
	MOVQ 120(R11), R13
	VMOVDQU64 (R13)(R12*1), Z26
	MOVQ 144(R11), R13
	VMOVDQU64 (R13)(R12*1), Z27
	MOVQ 168(R11), R13
	VMOVDQU64 (R13)(R12*1), Z28
	MOVQ 192(R11), R13
	VMOVDQU64 (R13)(R12*1), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z21, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x9_64Xor_loop
	VZEROUPPER

mulGFNI_7x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88
|
|
// Loading 5 of 63 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 74 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_7x9Xor_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
VBROADCASTSD 32(CX), Y4
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), DX
|
|
MOVQ out_base+48(FP), R11
|
|
MOVQ out_base+48(FP), R11
|
|
MOVQ start+72(FP), R12
|
|
|
|
// Add start offset to input
|
|
ADDQ R12, BX
|
|
ADDQ R12, SI
|
|
ADDQ R12, DI
|
|
ADDQ R12, R8
|
|
ADDQ R12, R9
|
|
ADDQ R12, R10
|
|
ADDQ R12, DX
|
|
|
|
mulAvxGFNI_7x9Xor_loop:
|
|
// Load 9 outputs
|
|
MOVQ (R11), R13
|
|
VMOVDQU (R13)(R12*1), Y5
|
|
MOVQ 24(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y6
|
|
MOVQ 48(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y7
|
|
MOVQ 72(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y8
|
|
MOVQ 96(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y9
|
|
MOVQ 120(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y10
|
|
MOVQ 144(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y11
|
|
MOVQ 168(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y12
|
|
MOVQ 192(R11), R13
|
|
VMOVDQU (R13)(R12*1), Y13
|
|
|
|
// Load and process 32 bytes from input 0 to 9 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 40(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 48(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 56(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 64(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 9 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 9 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 9 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 9 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 9 outputs
|
|
VMOVDQU (R10), Y14
|
|
ADDQ $0x20, R10
|
|
VBROADCASTSD 360(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 368(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 376(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 384(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 392(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 400(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 408(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 416(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 424(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 6 to 9 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 432(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 440(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 448(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 456(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 464(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 472(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 480(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 488(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 496(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 9 outputs
|
|
MOVQ (R11), R13
|
|
VMOVDQU Y5, (R13)(R12*1)
|
|
MOVQ 24(R11), R13
|
|
VMOVDQU Y6, (R13)(R12*1)
|
|
MOVQ 48(R11), R13
|
|
VMOVDQU Y7, (R13)(R12*1)
|
|
MOVQ 72(R11), R13
|
|
VMOVDQU Y8, (R13)(R12*1)
|
|
MOVQ 96(R11), R13
|
|
VMOVDQU Y9, (R13)(R12*1)
|
|
MOVQ 120(R11), R13
|
|
VMOVDQU Y10, (R13)(R12*1)
|
|
MOVQ 144(R11), R13
|
|
VMOVDQU Y11, (R13)(R12*1)
|
|
MOVQ 168(R11), R13
|
|
VMOVDQU Y12, (R13)(R12*1)
|
|
MOVQ 192(R11), R13
|
|
VMOVDQU Y13, (R13)(R12*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R12
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_7x9Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_7x9Xor_end:
|
|
RET
|
|
|
|
// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x10_64 combines 7 input slices into 10 output slices, 64 bytes
// per loop iteration, overwriting the outputs.
//
// For every 64-byte block, output j receives the XOR over i = 0..6 of
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+j]) with a zero constant term.
//
//   matrix  70 uint64 entries; entry i*10+j (byte offset 8*(i*10+j)) is the
//           operand applied to input i for output j.
//   in      slice headers of the 7 inputs; only the base pointers
//           (header offsets 0, 24, ..., 144) are read.
//   out     slice headers of the 10 outputs; base pointers at header
//           offsets 0, 24, ..., 216 are re-read on every iteration.
//   start   byte offset into every input and output at which work begins.
//   n       byte count; n>>6 blocks of 64 bytes are processed. A trailing
//           partial block is ignored, and n < 64 returns without touching
//           any buffer.
//
// Register use:
//   AX        remaining 64-byte blocks
//   CX        matrix base
//   BX, SI, DI, R8, R9, R10, DX   input cursors 0..6
//   R11       base of the out slice-header array
//   R12       current byte offset into the outputs
//   R13       scratch output pointer
//   Z0-Z9     broadcast matrix entries for input 0
//   Z10-Z19   broadcast matrix entries for input 1
//   Z20-Z29   accumulators for outputs 0..9
//   Z30       current input block
//   Z31       scratch product
TEXT ·mulGFNI_7x10_64(SB), $0-88
	// Loading 20 of 70 tables to registers; the remaining 50 are used as
	// broadcast memory operands inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x10_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the 7 input base pointers. DX is overwritten last because it
	// also holds the address of the slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output pointers are not kept in registers: R11 addresses the header
	// array and each pointer is reloaded when its result is stored.
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs.
	// The first input initialises the accumulators, so no XOR is needed.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs.
	// From here on the matrix entry is a broadcast memory operand.
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z20, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z21, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 216(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x10_64_loop
	VZEROUPPER

mulGFNI_7x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x10 combines 7 input slices into 10 output slices, 32 bytes
// per loop iteration, overwriting the outputs.
//
// For every 32-byte block, output j receives the XOR over i = 0..6 of
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+j]) with a zero constant term.
//
//   matrix  70 uint64 entries; entry i*10+j (byte offset 8*(i*10+j)) is the
//           operand applied to input i for output j.
//   in      slice headers of the 7 inputs; only the base pointers
//           (header offsets 0, 24, ..., 144) are read.
//   out     slice headers of the 10 outputs; base pointers at header
//           offsets 0, 24, ..., 216 are re-read on every iteration.
//   start   byte offset into every input and output at which work begins.
//   n       byte count; n>>5 blocks of 32 bytes are processed. A trailing
//           partial block is ignored, and n < 32 returns without touching
//           any buffer.
//
// Register use:
//   AX        remaining 32-byte blocks
//   CX        matrix base
//   BX, SI, DI, R8, R9, R10, DX   input cursors 0..6
//   R11       base of the out slice-header array
//   R12       current byte offset into the outputs
//   R13       scratch output pointer
//   Y0-Y3     broadcast matrix entries 0..3 (input 0, outputs 0..3)
//   Y4-Y13    accumulators for outputs 0..9
//   Y14       current input block
//   Y15       scratch matrix entry / product
TEXT ·mulAvxGFNI_7x10(SB), $0-88
	// Loading 4 of 70 tables to registers; every other entry is broadcast
	// from memory right before it is used.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x10_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the 7 input base pointers. DX is overwritten last because it
	// also holds the address of the slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output pointers are not kept in registers: R11 addresses the header
	// array and each pointer is reloaded when its result is stored.
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs.
	// The first input initialises the accumulators, so no XOR is needed;
	// entries 4..9 are broadcast straight into their accumulator register.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R11), R13
	VMOVDQU Y4, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y5, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 216(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x10_loop
	VZEROUPPER

mulAvxGFNI_7x10_end:
	RET
|
|
|
|
// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x10_64Xor combines 7 input slices into 10 output slices, 64 bytes
// per loop iteration, XOR-ing the result into the existing output contents.
//
// For every 64-byte block, output j becomes its previous value XOR-ed with
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+j]) (zero constant term) for each
// i = 0..6.
//
//   matrix  70 uint64 entries; entry i*10+j (byte offset 8*(i*10+j)) is the
//           operand applied to input i for output j.
//   in      slice headers of the 7 inputs; only the base pointers
//           (header offsets 0, 24, ..., 144) are read.
//   out     slice headers of the 10 outputs; base pointers at header
//           offsets 0, 24, ..., 216 are re-read for the load and for the
//           store on every iteration.
//   start   byte offset into every input and output at which work begins.
//   n       byte count; n>>6 blocks of 64 bytes are processed. A trailing
//           partial block is ignored, and n < 64 returns without touching
//           any buffer.
//
// Register use:
//   AX        remaining 64-byte blocks
//   CX        matrix base
//   BX, SI, DI, R8, R9, R10, DX   input cursors 0..6
//   R11       base of the out slice-header array
//   R12       current byte offset into the outputs
//   R13       scratch output pointer
//   Z0-Z9     broadcast matrix entries for input 0
//   Z10-Z19   broadcast matrix entries for input 1
//   Z20-Z29   accumulators for outputs 0..9
//   Z30       current input block
//   Z31       scratch product
TEXT ·mulGFNI_7x10_64Xor(SB), $0-88
	// Loading 20 of 70 tables to registers; the remaining 50 are used as
	// broadcast memory operands inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x10_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the 7 input base pointers. DX is overwritten last because it
	// also holds the address of the slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX

	// Output pointers are not kept in registers: R11 addresses the header
	// array and each pointer is reloaded whenever an output is accessed.
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x10_64Xor_loop:
	// Load 10 outputs: the accumulators start from the current contents.
	MOVQ (R11), R13
	VMOVDQU64 (R13)(R12*1), Z20
	MOVQ 24(R11), R13
	VMOVDQU64 (R13)(R12*1), Z21
	MOVQ 48(R11), R13
	VMOVDQU64 (R13)(R12*1), Z22
	MOVQ 72(R11), R13
	VMOVDQU64 (R13)(R12*1), Z23
	MOVQ 96(R11), R13
	VMOVDQU64 (R13)(R12*1), Z24
	MOVQ 120(R11), R13
	VMOVDQU64 (R13)(R12*1), Z25
	MOVQ 144(R11), R13
	VMOVDQU64 (R13)(R12*1), Z26
	MOVQ 168(R11), R13
	VMOVDQU64 (R13)(R12*1), Z27
	MOVQ 192(R11), R13
	VMOVDQU64 (R13)(R12*1), Z28
	MOVQ 216(R11), R13
	VMOVDQU64 (R13)(R12*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs.
	// From here on the matrix entry is a broadcast memory operand.
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z20, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z21, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 216(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x10_64Xor_loop
	VZEROUPPER

mulGFNI_7x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the 7 inputs i and 10 outputs j, XORs the VGF2P8AFFINEQB
// transform of in[i] by matrix[i*10+j] into out[j]. Works on 32 bytes per
// iteration, beginning at byte offset start, for n>>5 iterations; a tail of
// fewer than 32 bytes is not processed.
//
// Register use:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base; entries 4..69 are re-broadcast in the loop
//   Y0-Y3                         matrix[0..3], broadcast once before the loop
//   BX, SI, DI, R8, R9, R10, DX   read cursors for in[0..6], pre-advanced by start
//   R11                           base of the out slice headers (24 bytes apart)
//   R12                           running byte offset into every output, starts at start
//   R13                           scratch: data pointer of the output being loaded/stored
//   Y4-Y13                        accumulators for out[0..9]
//   Y14 / Y15                     current input block / scratch
TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88
	// Loading 4 of 70 tables to registers
	// Output data pointers are re-read from the out slice headers each iteration
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_7x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x10Xor_loop:
	// Load 10 outputs
	MOVQ (R11), R13
	VMOVDQU (R13)(R12*1), Y4
	MOVQ 24(R11), R13
	VMOVDQU (R13)(R12*1), Y5
	MOVQ 48(R11), R13
	VMOVDQU (R13)(R12*1), Y6
	MOVQ 72(R11), R13
	VMOVDQU (R13)(R12*1), Y7
	MOVQ 96(R11), R13
	VMOVDQU (R13)(R12*1), Y8
	MOVQ 120(R11), R13
	VMOVDQU (R13)(R12*1), Y9
	MOVQ 144(R11), R13
	VMOVDQU (R13)(R12*1), Y10
	MOVQ 168(R11), R13
	VMOVDQU (R13)(R12*1), Y11
	MOVQ 192(R11), R13
	VMOVDQU (R13)(R12*1), Y12
	MOVQ 216(R11), R13
	VMOVDQU (R13)(R12*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R11), R13
	VMOVDQU Y4, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y5, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 216(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ  mulAvxGFNI_7x10Xor_loop
	VZEROUPPER

mulAvxGFNI_7x10Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites out[0] with the XOR over i = 0..7 of the VGF2P8AFFINEQB
// transform of in[i] by matrix[i]. Works on 64 bytes per iteration, beginning
// at byte offset start, for n>>6 iterations; a tail of fewer than 64 bytes is
// not processed.
//
// Register use:
//   AX                                remaining 64-byte blocks
//   Z0-Z7                             matrix[0..7], broadcast once
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0..7]
//   R11                               write cursor for out[0]
//   R12                               start offset (setup only)
//   Z8 / Z9                           accumulator / current input block
TEXT ·mulGFNI_8x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x1_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// CX is free once all tables are loaded; reuse it for the input headers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R11
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, CX

mulGFNI_8x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// (first input initialises the accumulator, no XOR needed)
	VMOVDQU64 (DX), Z9
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z9, Z8

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z9
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z9
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z9
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z9
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z9
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z9
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (CX), Z9
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Store 1 outputs
	VMOVDQU64 Z8, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x1_64_loop
	VZEROUPPER

mulGFNI_8x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites out[0] with the XOR over i = 0..7 of the VGF2P8AFFINEQB
// transform of in[i] by matrix[i]. Works on 32 bytes per iteration, beginning
// at byte offset start, for n>>5 iterations; a tail of fewer than 32 bytes is
// not processed.
//
// Register use:
//   AX                                remaining 32-byte blocks
//   Y0-Y7                             matrix[0..7], broadcast once
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0..7]
//   R11                               write cursor for out[0]
//   R12                               start offset (setup only)
//   Y8 / Y9                           accumulator / current input block
TEXT ·mulAvxGFNI_8x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x1_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// CX is free once all tables are loaded; reuse it for the input headers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R11
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, CX

mulAvxGFNI_8x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// (first input initialises the accumulator, no XOR needed)
	VMOVDQU (DX), Y9
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y9, Y8

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y9
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y9
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y9
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y9
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y9
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y9
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (CX), Y9
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Store 1 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x1_loop
	VZEROUPPER

mulAvxGFNI_8x1_end:
	RET
|
|
|
|
// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs into the existing contents of out[0] the VGF2P8AFFINEQB transform of
// in[i] by matrix[i] for i = 0..7. Works on 64 bytes per iteration, beginning
// at byte offset start, for n>>6 iterations; a tail of fewer than 64 bytes is
// not processed.
//
// Register use:
//   AX                                remaining 64-byte blocks
//   Z0-Z7                             matrix[0..7], broadcast once
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0..7]
//   R11                               read/write cursor for out[0]
//   R12                               start offset (setup only)
//   Z8 / Z9                           accumulator / current input block
TEXT ·mulGFNI_8x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7

	// CX is free once all tables are loaded; reuse it for the input headers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R11
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, CX

mulGFNI_8x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R11), Z8

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z9
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z9
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z9
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z9
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z9
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z9
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z9
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (CX), Z9
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
	VXORPD Z8, Z9, Z8

	// Store 1 outputs
	VMOVDQU64 Z8, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x1_64Xor_loop
	VZEROUPPER

mulGFNI_8x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs into the existing contents of out[0] the VGF2P8AFFINEQB transform of
// in[i] by matrix[i] for i = 0..7. Works on 32 bytes per iteration, beginning
// at byte offset start, for n>>5 iterations; a tail of fewer than 32 bytes is
// not processed.
//
// Register use:
//   AX                                remaining 32-byte blocks
//   Y0-Y7                             matrix[0..7], broadcast once
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0..7]
//   R11                               read/write cursor for out[0]
//   R12                               start offset (setup only)
//   Y8 / Y9                           accumulator / current input block
TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// CX is free once all tables are loaded; reuse it for the input headers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R11
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, CX

mulAvxGFNI_8x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R11), Y8

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y9
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y9
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y9
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y9
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y9
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y9
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y9
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (CX), Y9
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
	VXORPD Y8, Y9, Y8

	// Store 1 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x1Xor_loop
	VZEROUPPER

mulAvxGFNI_8x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites out[j] (j = 0, 1) with the XOR over i = 0..7 of the
// VGF2P8AFFINEQB transform of in[i] by matrix[i*2+j]. Works on 64 bytes per
// iteration, beginning at byte offset start, for n>>6 iterations; a tail of
// fewer than 64 bytes is not processed.
//
// Register use:
//   AX                                remaining 64-byte blocks
//   Z0-Z15                            matrix[0..15], broadcast once
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0..7]
//   R12, R11                          write cursors for out[0], out[1]
//   R13                               start offset (setup only)
//   Z16, Z17                          accumulators for out[0], out[1]
//   Z18 / Z19                         current input block / scratch
TEXT ·mulGFNI_8x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15

	// CX is free once all tables are loaded; reuse it for the input headers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, CX

mulGFNI_8x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// (first input initialises both accumulators, no XOR needed)
	VMOVDQU64 (DX), Z18
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z16
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z17

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z18
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z18
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z18
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z18
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z18
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (R10), Z18
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64 (CX), Z18
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Store 2 outputs
	VMOVDQU64 Z16, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z17, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x2_64_loop
	VZEROUPPER

mulGFNI_8x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites out[j] (j = 0, 1) with the XOR over i = 0..7 of the
// VGF2P8AFFINEQB transform of in[i] by matrix[i*2+j]. Works on 32 bytes per
// iteration, beginning at byte offset start, for n>>5 iterations; a tail of
// fewer than 32 bytes is not processed.
//
// Register use:
//   AX                                 remaining 32-byte blocks
//   CX                                 matrix base; entries 12..15 are re-broadcast in the loop
//   Y0-Y11                             matrix[0..11], broadcast once
//   BX, SI, DI, R8, R9, R10, R11, DX   read cursors for in[0..7]
//   R13, R12                           write cursors for out[0], out[1]
//   R14                                start offset (setup only)
//   Y12, Y13                           accumulators for out[0], out[1]
//   Y14 / Y15                          current input block / scratch
TEXT ·mulAvxGFNI_8x2(SB), $0-88
	// Loading 12 of 16 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R12
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13
	ADDQ R14, R12

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, DX

mulAvxGFNI_8x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// (first input initialises both accumulators, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (tables 12..15 are not resident; broadcast them from memory)
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x2_loop
	VZEROUPPER

mulAvxGFNI_8x2_end:
	RET
|
|
|
|
// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs into the existing contents of out[j] (j = 0, 1) the VGF2P8AFFINEQB
// transform of in[i] by matrix[i*2+j] for i = 0..7. Works on 64 bytes per
// iteration, beginning at byte offset start, for n>>6 iterations; a tail of
// fewer than 64 bytes is not processed.
//
// Register use:
//   AX                                remaining 64-byte blocks
//   Z0-Z15                            matrix[0..15], broadcast once
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0..7]
//   R12, R11                          read/write cursors for out[0], out[1]
//   R13                               start offset (setup only)
//   Z16, Z17                          accumulators for out[0], out[1]
//   Z18 / Z19                         current input block / scratch
TEXT ·mulGFNI_8x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15

	// CX is free once all tables are loaded; reuse it for the input headers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, CX

mulGFNI_8x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R12), Z16
	VMOVDQU64 (R11), Z17

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (DX), Z18
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z18
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z18
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z18
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z18
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z18
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (R10), Z18
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64 (CX), Z18
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Store 2 outputs
	VMOVDQU64 Z16, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z17, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x2_64Xor_loop
	VZEROUPPER

mulGFNI_8x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs into the existing contents of out[j] (j = 0, 1) the VGF2P8AFFINEQB
// transform of in[i] by matrix[i*2+j] for i = 0..7. Works on 32 bytes per
// iteration, beginning at byte offset start, for n>>5 iterations; a tail of
// fewer than 32 bytes is not processed.
//
// Register use:
//   AX                                 remaining 32-byte blocks
//   CX                                 matrix base; entries 12..15 are re-broadcast in the loop
//   Y0-Y11                             matrix[0..11], broadcast once
//   BX, SI, DI, R8, R9, R10, R11, DX   read cursors for in[0..7]
//   R13, R12                           read/write cursors for out[0], out[1]
//   R14                                start offset (setup only)
//   Y12, Y13                           accumulators for out[0], out[1]
//   Y14 / Y15                          current input block / scratch
TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88
	// Loading 12 of 16 tables to registers
	// Destination kept in GP registers
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the block count can be tested directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R12
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13
	ADDQ R14, R12

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, DX

mulAvxGFNI_8x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R13), Y12
	VMOVDQU (R12), Y13

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (tables 12..15 are not resident; broadcast them from memory)
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x2Xor_loop
	VZEROUPPER

mulAvxGFNI_8x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies an 8-input x 3-output set of GF(2) affine tables to 8 input
// slices and OVERWRITES 3 output slices, 64 bytes per loop iteration.
//
//   matrix: 24 uint64 tables; the table for input i / output j is
//           matrix[i*3+j]. All 24 are broadcast into Z0-Z23 up front.
//   in:     8 slice headers (24 bytes apart); only the data pointers are read.
//   out:    3 slice headers; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; only n/64 whole blocks are processed, so a count
//           below 64 returns without touching memory or vector state.
//
// Register roles inside the loop:
//   AX                            remaining 64-byte blocks
//   DX, BX, SI, DI, R8-R10, CX    input 0..7 cursors
//   R12, R13, R11                 output 0..2 cursors
//   Z24-Z26                       output accumulators
//   Z27                           current input block, Z28 scratch product
TEXT ·mulGFNI_8x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// The matrix pointer is no longer needed; CX is recycled as the
	// in-slice base and finally as the cursor for input 7.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, CX

mulGFNI_8x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (DX), Z27
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z27, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z27, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z27, Z26

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z27
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z27
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z27
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z27
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z27
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z27
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (CX), Z27
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Store 3 outputs
	VMOVDQU64 Z24, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z25, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z26, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x3_64_loop

	// Only reached when vector registers were actually used.
	VZEROUPPER

mulGFNI_8x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies an 8-input x 3-output set of GF(2) affine tables to 8 input
// slices and OVERWRITES 3 output slices, 32 bytes per loop iteration.
//
//   matrix: 24 uint64 tables; the table for input i / output j is
//           matrix[i*3+j]. Tables 0-10 are kept in Y0-Y10; tables 11-23
//           are re-broadcast from memory inside the loop, so CX must keep
//           pointing at the matrix for the whole function.
//   in:     8 slice headers (24 bytes apart); only the data pointers are read.
//   out:    3 slice headers; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; only n/32 whole blocks are processed, so a count
//           below 32 returns without touching memory or vector state.
//
// Register roles inside the loop:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base
//   BX, SI, DI, R8-R11, DX        input 0..7 cursors
//   R13, R14, R12                 output 0..2 cursors
//   Y11-Y13                       output accumulators
//   Y14                           current input block, Y15 scratch table/product
TEXT ·mulAvxGFNI_8x3(SB), $0-88
	// Loading 11 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R12
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R12

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, DX

mulAvxGFNI_8x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from table 11 onwards the table is streamed through Y15)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x3_loop

	// Only reached when vector registers were actually used.
	VZEROUPPER

mulAvxGFNI_8x3_end:
	RET
|
|
|
|
// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies an 8-input x 3-output set of GF(2) affine tables to 8 input
// slices and XORs the result INTO the existing contents of 3 output
// slices, 64 bytes per loop iteration.
//
//   matrix: 24 uint64 tables; the table for input i / output j is
//           matrix[i*3+j]. All 24 are broadcast into Z0-Z23 up front.
//   in:     8 slice headers (24 bytes apart); only the data pointers are read.
//   out:    3 slice headers; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; only n/64 whole blocks are processed, so a count
//           below 64 returns without touching memory or vector state.
//
// Register roles inside the loop:
//   AX                            remaining 64-byte blocks
//   DX, BX, SI, DI, R8-R10, CX    input 0..7 cursors
//   R12, R13, R11                 output 0..2 cursors
//   Z24-Z26                       output accumulators (seeded from memory)
//   Z27                           current input block, Z28 scratch product
TEXT ·mulGFNI_8x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// The matrix pointer is no longer needed; CX is recycled as the
	// in-slice base and finally as the cursor for input 7.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, CX

mulGFNI_8x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R12), Z24
	VMOVDQU64 (R13), Z25
	VMOVDQU64 (R11), Z26

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z27
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z27
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z27
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z27
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z27
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z27
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z27
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (CX), Z27
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
	VXORPD Z26, Z28, Z26

	// Store 3 outputs
	VMOVDQU64 Z24, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z25, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z26, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x3_64Xor_loop

	// Only reached when vector registers were actually used.
	VZEROUPPER

mulGFNI_8x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies an 8-input x 3-output set of GF(2) affine tables to 8 input
// slices and XORs the result INTO the existing contents of 3 output
// slices, 32 bytes per loop iteration.
//
//   matrix: 24 uint64 tables; the table for input i / output j is
//           matrix[i*3+j]. Tables 0-10 are kept in Y0-Y10; tables 11-23
//           are re-broadcast from memory inside the loop, so CX must keep
//           pointing at the matrix for the whole function.
//   in:     8 slice headers (24 bytes apart); only the data pointers are read.
//   out:    3 slice headers; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; only n/32 whole blocks are processed, so a count
//           below 32 returns without touching memory or vector state.
//
// Register roles inside the loop:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base
//   BX, SI, DI, R8-R11, DX        input 0..7 cursors
//   R13, R14, R12                 output 0..2 cursors
//   Y11-Y13                       output accumulators (seeded from memory)
//   Y14                           current input block, Y15 scratch table/product
TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88
	// Loading 11 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R12
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R12

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, DX

mulAvxGFNI_8x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R12), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from table 11 onwards the table is streamed through Y15)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x3Xor_loop

	// Only reached when vector registers were actually used.
	VZEROUPPER

mulAvxGFNI_8x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Applies an 8-input x 4-output set of GF(2) affine tables to 8 input
// slices and OVERWRITES 4 output slices, 64 bytes per loop iteration.
//
//   matrix: 32 uint64 tables; the table for input i / output j is
//           matrix[i*4+j]. Tables 0-25 are kept in Z0-Z25; tables 26-31
//           are used as broadcast memory operands inside the loop, so CX
//           must keep pointing at the matrix for the whole function.
//   in:     8 slice headers (24 bytes apart); only the data pointers are read.
//   out:    4 slice headers; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; only n/64 whole blocks are processed, so a count
//           below 64 returns without touching memory or vector state.
//
// Register roles inside the loop:
//   AX                            remaining 64-byte blocks
//   CX                            matrix base
//   BX, SI, DI, R8-R11, DX        input 0..7 cursors
//   R13, R14, R15, R12            output 0..3 cursors
//   Z26-Z29                       output accumulators
//   Z30                           current input block, Z31 scratch product
//
// BP is used as a scratch register for start.
// NOTE(review): the $8 frame is presumably declared so the assembler
// saves/restores BP around the body - confirm against the Go assembler docs.
TEXT ·mulGFNI_8x4_64(SB), $8-88
	// Loading 26 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, DX

mulGFNI_8x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	// (from table 26 onwards the table is a broadcast memory operand)
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 4 outputs
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_8x4_64_loop

	// Only reached when vector registers were actually used.
	VZEROUPPER

mulGFNI_8x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Applies an 8-input x 4-output set of GF(2) affine tables to 8 input
// slices and OVERWRITES 4 output slices, 32 bytes per loop iteration.
//
//   matrix: 32 uint64 tables; the table for input i / output j is
//           matrix[i*4+j]. Tables 0-9 are kept in Y0-Y9; tables 10-31
//           are re-broadcast from memory inside the loop, so CX must keep
//           pointing at the matrix for the whole function.
//   in:     8 slice headers (24 bytes apart); only the data pointers are read.
//   out:    4 slice headers; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; only n/32 whole blocks are processed, so a count
//           below 32 returns without touching memory or vector state.
//
// Register roles inside the loop:
//   AX                            remaining 32-byte blocks
//   CX                            matrix base
//   BX, SI, DI, R8-R11, DX        input 0..7 cursors
//   R13, R14, R15, R12            output 0..3 cursors
//   Y10-Y13                       output accumulators
//   Y14                           current input block, Y15 scratch table/product
//
// BP is used as a scratch register for start.
// NOTE(review): the $8 frame is presumably declared so the assembler
// saves/restores BP around the body - confirm against the Go assembler docs.
TEXT ·mulAvxGFNI_8x4(SB), $8-88
	// Loading 10 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, DX

mulAvxGFNI_8x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from table 10 onwards the table is streamed through Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_8x4_loop

	// Only reached when vector registers were actually used.
	VZEROUPPER

mulAvxGFNI_8x4_end:
	RET
|
|
|
|
// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_8x4_64Xor(SB), $8-88
|
|
// Loading 26 of 32 tables to registers
|
|
// Destination kept in GP registers
|
|
// Full registers estimated 38 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_8x4_64Xor_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
VBROADCASTF32X2 168(CX), Z21
|
|
VBROADCASTF32X2 176(CX), Z22
|
|
VBROADCASTF32X2 184(CX), Z23
|
|
VBROADCASTF32X2 192(CX), Z24
|
|
VBROADCASTF32X2 200(CX), Z25
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), DX
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ (R12), R13
|
|
MOVQ 24(R12), R14
|
|
MOVQ 48(R12), R15
|
|
MOVQ 72(R12), R12
|
|
MOVQ start+72(FP), BP
|
|
|
|
// Add start offset to output
|
|
ADDQ BP, R13
|
|
ADDQ BP, R14
|
|
ADDQ BP, R15
|
|
ADDQ BP, R12
|
|
|
|
// Add start offset to input
|
|
ADDQ BP, BX
|
|
ADDQ BP, SI
|
|
ADDQ BP, DI
|
|
ADDQ BP, R8
|
|
ADDQ BP, R9
|
|
ADDQ BP, R10
|
|
ADDQ BP, R11
|
|
ADDQ BP, DX
|
|
|
|
mulGFNI_8x4_64Xor_loop:
|
|
// Load 4 outputs
|
|
VMOVDQU64 (R13), Z26
|
|
VMOVDQU64 (R14), Z27
|
|
VMOVDQU64 (R15), Z28
|
|
VMOVDQU64 (R12), Z29
|
|
|
|
// Load and process 64 bytes from input 0 to 4 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 4 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 4 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 4 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 4 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 4 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 4 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 4 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 4 outputs
|
|
VMOVDQU64 Z26, (R13)
|
|
ADDQ $0x40, R13
|
|
VMOVDQU64 Z27, (R14)
|
|
ADDQ $0x40, R14
|
|
VMOVDQU64 Z28, (R15)
|
|
ADDQ $0x40, R15
|
|
VMOVDQU64 Z29, (R12)
|
|
ADDQ $0x40, R12
|
|
|
|
// Prepare for next loop
|
|
DECQ AX
|
|
JNZ mulGFNI_8x4_64Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_8x4_64Xor_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x4Xor combines 8 input slices into 4 output slices, 32 bytes per
// input per iteration, XORing the result into the bytes already stored in the
// outputs. For every 32-byte block:
//
//	out[o] ^= affine(matrix[0*4+o], in[0]) ^ ... ^ affine(matrix[7*4+o], in[7])
//
// where affine is VGF2P8AFFINEQB with a zero constant and matrix[i*4+o] is the
// 8-byte entry at byte offset (i*4+o)*8 from matrix_base.
//
// The loop runs n>>5 times, so only whole 32-byte blocks are handled; when
// n < 32 the routine returns without reading or writing any slice data.
// start is added to every input and output pointer before the loop.
//
// Register roles:
//	AX                              remaining 32-byte blocks
//	CX                              matrix base
//	BX, SI, DI, R8, R9, R10, R11, DX  cursors into in[0] .. in[7]
//	R13, R14, R15, R12              cursors into out[0] .. out[3]
//	BP                              start offset
//	Y0-Y9                           matrix entries 0-9, broadcast once
//	Y10-Y13                         accumulators for out[0] .. out[3]
//	Y14                             current input block
//	Y15                             scratch (table broadcast / product)
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the Go assembler saves and restores it - confirm.
// NOTE(review): this file is generated (see the header: gen.go); the duplicate
// out_base load and the TESTQ after SHRQ that were dropped here should also be
// removed in the generator.
TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88
	// Block count = n / 32. SHRQ with a non-zero count sets ZF from its
	// result, so JZ can follow directly.
	MOVQ           n+80(FP), AX
	MOVQ           matrix_base+0(FP), CX
	SHRQ           $0x05, AX
	JZ             mulAvxGFNI_8x4Xor_end

	// Preload the first 10 of the 32 matrix entries
	// (all of input 0 and input 1, plus outputs 0-1 of input 2).
	VBROADCASTSD   (CX), Y0
	VBROADCASTSD   8(CX), Y1
	VBROADCASTSD   16(CX), Y2
	VBROADCASTSD   24(CX), Y3
	VBROADCASTSD   32(CX), Y4
	VBROADCASTSD   40(CX), Y5
	VBROADCASTSD   48(CX), Y6
	VBROADCASTSD   56(CX), Y7
	VBROADCASTSD   64(CX), Y8
	VBROADCASTSD   72(CX), Y9

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). DX is overwritten last because it holds the header base.
	MOVQ           in_base+24(FP), DX
	MOVQ           (DX), BX
	MOVQ           24(DX), SI
	MOVQ           48(DX), DI
	MOVQ           72(DX), R8
	MOVQ           96(DX), R9
	MOVQ           120(DX), R10
	MOVQ           144(DX), R11
	MOVQ           168(DX), DX

	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ           out_base+48(FP), R12
	MOVQ           (R12), R13
	MOVQ           24(R12), R14
	MOVQ           48(R12), R15
	MOVQ           72(R12), R12
	MOVQ           start+72(FP), BP

	// Add start offset to output
	ADDQ           BP, R13
	ADDQ           BP, R14
	ADDQ           BP, R15
	ADDQ           BP, R12

	// Add start offset to input
	ADDQ           BP, BX
	ADDQ           BP, SI
	ADDQ           BP, DI
	ADDQ           BP, R8
	ADDQ           BP, R9
	ADDQ           BP, R10
	ADDQ           BP, R11
	ADDQ           BP, DX

mulAvxGFNI_8x4Xor_loop:
	// Load 4 outputs (the existing contents are the XOR seed)
	VMOVDQU        (R13), Y10
	VMOVDQU        (R14), Y11
	VMOVDQU        (R15), Y12
	VMOVDQU        (R12), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs.
	// From matrix entry 10 onwards the table is re-broadcast from memory into
	// Y15 on every use, because Y0-Y9 are the only preloaded entries.
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs and advance each output cursor by one block
	VMOVDQU        Y10, (R13)
	ADDQ           $0x20, R13
	VMOVDQU        Y11, (R14)
	ADDQ           $0x20, R14
	VMOVDQU        Y12, (R15)
	ADDQ           $0x20, R15
	VMOVDQU        Y13, (R12)
	ADDQ           $0x20, R12

	// Prepare for next loop
	DECQ           AX
	JNZ            mulAvxGFNI_8x4Xor_loop
	VZEROUPPER

mulAvxGFNI_8x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x5_64 combines 8 input slices into 5 output slices, 64 bytes per
// input per iteration, overwriting the outputs. For every 64-byte block:
//
//	out[o] = affine(matrix[0*5+o], in[0]) ^ ... ^ affine(matrix[7*5+o], in[7])
//
// where affine is VGF2P8AFFINEQB with a zero constant and matrix[i*5+o] is the
// 8-byte entry at byte offset (i*5+o)*8 from matrix_base.
//
// The loop runs n>>6 times, so only whole 64-byte blocks are handled; when
// n < 64 the routine returns without reading or writing any slice data.
// start is added to every input and output pointer before the loop.
//
// Register roles inside the loop:
//	BP                              remaining 64-byte blocks
//	CX                              matrix base
//	DX, BX, SI, DI, R8, R9, R10, AX  cursors into in[0] .. in[7]
//	R12, R13, R14, R15, R11         cursors into out[0] .. out[4]
//	Z0-Z24                          matrix entries 0-24, broadcast once
//	Z25-Z29                         accumulators for out[0] .. out[4]
//	Z30                             current input block
//	Z31                             scratch product
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the Go assembler saves and restores it - confirm.
// NOTE(review): this file is generated (see the header: gen.go); the duplicate
// out_base load and the TESTQ after SHRQ that were dropped here should also be
// removed in the generator.
TEXT ·mulGFNI_8x5_64(SB), $8-88
	// Early exit when there is not a single whole block. SHRQ with a
	// non-zero count sets ZF from its result, so JZ can follow directly.
	MOVQ                n+80(FP), AX
	MOVQ                matrix_base+0(FP), CX
	SHRQ                $0x06, AX
	JZ                  mulGFNI_8x5_64_end

	// Preload the first 25 of the 40 matrix entries (inputs 0-4).
	VBROADCASTF32X2     (CX), Z0
	VBROADCASTF32X2     8(CX), Z1
	VBROADCASTF32X2     16(CX), Z2
	VBROADCASTF32X2     24(CX), Z3
	VBROADCASTF32X2     32(CX), Z4
	VBROADCASTF32X2     40(CX), Z5
	VBROADCASTF32X2     48(CX), Z6
	VBROADCASTF32X2     56(CX), Z7
	VBROADCASTF32X2     64(CX), Z8
	VBROADCASTF32X2     72(CX), Z9
	VBROADCASTF32X2     80(CX), Z10
	VBROADCASTF32X2     88(CX), Z11
	VBROADCASTF32X2     96(CX), Z12
	VBROADCASTF32X2     104(CX), Z13
	VBROADCASTF32X2     112(CX), Z14
	VBROADCASTF32X2     120(CX), Z15
	VBROADCASTF32X2     128(CX), Z16
	VBROADCASTF32X2     136(CX), Z17
	VBROADCASTF32X2     144(CX), Z18
	VBROADCASTF32X2     152(CX), Z19
	VBROADCASTF32X2     160(CX), Z20
	VBROADCASTF32X2     168(CX), Z21
	VBROADCASTF32X2     176(CX), Z22
	VBROADCASTF32X2     184(CX), Z23
	VBROADCASTF32X2     192(CX), Z24

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). AX is overwritten last because it holds the header base; the
	// block count it held before is recomputed into BP below.
	MOVQ                in_base+24(FP), AX
	MOVQ                (AX), DX
	MOVQ                24(AX), BX
	MOVQ                48(AX), SI
	MOVQ                72(AX), DI
	MOVQ                96(AX), R8
	MOVQ                120(AX), R9
	MOVQ                144(AX), R10
	MOVQ                168(AX), AX

	// Fetch the data pointer of each output slice; R11 is overwritten last.
	MOVQ                out_base+48(FP), R11
	MOVQ                (R11), R12
	MOVQ                24(R11), R13
	MOVQ                48(R11), R14
	MOVQ                72(R11), R15
	MOVQ                96(R11), R11
	MOVQ                start+72(FP), BP

	// Add start offset to output
	ADDQ                BP, R12
	ADDQ                BP, R13
	ADDQ                BP, R14
	ADDQ                BP, R15
	ADDQ                BP, R11

	// Add start offset to input
	ADDQ                BP, DX
	ADDQ                BP, BX
	ADDQ                BP, SI
	ADDQ                BP, DI
	ADDQ                BP, R8
	ADDQ                BP, R9
	ADDQ                BP, R10
	ADDQ                BP, AX

	// Reload length to save a register: BP becomes the block counter
	MOVQ                n+80(FP), BP
	SHRQ                $0x06, BP

mulGFNI_8x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs.
	// The first input writes the accumulators directly, so previous output
	// contents are never read.
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB      $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB      $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB      $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB      $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB      $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64           (BX), Z30
	ADDQ                $0x40, BX
	VGF2P8AFFINEQB      $0x00, Z5, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z6, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z7, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z8, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z9, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64           (SI), Z30
	ADDQ                $0x40, SI
	VGF2P8AFFINEQB      $0x00, Z10, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z11, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z12, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z13, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z14, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z15, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z22, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z23, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z24, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs.
	// Matrix entries 25-39 are not preloaded; they are used as embedded
	// broadcast memory operands (.BCST) instead.
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 5 outputs and advance each output cursor by one block
	VMOVDQU64           Z25, (R12)
	ADDQ                $0x40, R12
	VMOVDQU64           Z26, (R13)
	ADDQ                $0x40, R13
	VMOVDQU64           Z27, (R14)
	ADDQ                $0x40, R14
	VMOVDQU64           Z28, (R15)
	ADDQ                $0x40, R15
	VMOVDQU64           Z29, (R11)
	ADDQ                $0x40, R11

	// Prepare for next loop
	DECQ                BP
	JNZ                 mulGFNI_8x5_64_loop
	VZEROUPPER

mulGFNI_8x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x5 combines 8 input slices into 5 output slices, 32 bytes per
// input per iteration, overwriting the outputs. For every 32-byte block:
//
//	out[o] = affine(matrix[0*5+o], in[0]) ^ ... ^ affine(matrix[7*5+o], in[7])
//
// where affine is VGF2P8AFFINEQB with a zero constant and matrix[i*5+o] is the
// 8-byte entry at byte offset (i*5+o)*8 from matrix_base.
//
// The loop runs n>>5 times, so only whole 32-byte blocks are handled; when
// n < 32 the routine returns without reading or writing any slice data.
// start is added to every input and output pointer before the loop.
//
// Register roles inside the loop:
//	BP                              remaining 32-byte blocks
//	CX                              matrix base
//	DX, BX, SI, DI, R8, R9, R10, AX  cursors into in[0] .. in[7]
//	R12, R13, R14, R15, R11         cursors into out[0] .. out[4]
//	Y0-Y8                           matrix entries 0-8, broadcast once
//	Y9-Y13                          accumulators for out[0] .. out[4]
//	Y14                             current input block
//	Y15                             scratch (table broadcast / product)
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the Go assembler saves and restores it - confirm.
// NOTE(review): this file is generated (see the header: gen.go); the duplicate
// out_base load and the TESTQ after SHRQ that were dropped here should also be
// removed in the generator.
TEXT ·mulAvxGFNI_8x5(SB), $8-88
	// Early exit when there is not a single whole block. SHRQ with a
	// non-zero count sets ZF from its result, so JZ can follow directly.
	MOVQ           n+80(FP), AX
	MOVQ           matrix_base+0(FP), CX
	SHRQ           $0x05, AX
	JZ             mulAvxGFNI_8x5_end

	// Preload the first 9 of the 40 matrix entries
	// (all of input 0, plus outputs 0-3 of input 1).
	VBROADCASTSD   (CX), Y0
	VBROADCASTSD   8(CX), Y1
	VBROADCASTSD   16(CX), Y2
	VBROADCASTSD   24(CX), Y3
	VBROADCASTSD   32(CX), Y4
	VBROADCASTSD   40(CX), Y5
	VBROADCASTSD   48(CX), Y6
	VBROADCASTSD   56(CX), Y7
	VBROADCASTSD   64(CX), Y8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). AX is overwritten last because it holds the header base; the
	// block count it held before is recomputed into BP below.
	MOVQ           in_base+24(FP), AX
	MOVQ           (AX), DX
	MOVQ           24(AX), BX
	MOVQ           48(AX), SI
	MOVQ           72(AX), DI
	MOVQ           96(AX), R8
	MOVQ           120(AX), R9
	MOVQ           144(AX), R10
	MOVQ           168(AX), AX

	// Fetch the data pointer of each output slice; R11 is overwritten last.
	MOVQ           out_base+48(FP), R11
	MOVQ           (R11), R12
	MOVQ           24(R11), R13
	MOVQ           48(R11), R14
	MOVQ           72(R11), R15
	MOVQ           96(R11), R11
	MOVQ           start+72(FP), BP

	// Add start offset to output
	ADDQ           BP, R12
	ADDQ           BP, R13
	ADDQ           BP, R14
	ADDQ           BP, R15
	ADDQ           BP, R11

	// Add start offset to input
	ADDQ           BP, DX
	ADDQ           BP, BX
	ADDQ           BP, SI
	ADDQ           BP, DI
	ADDQ           BP, R8
	ADDQ           BP, R9
	ADDQ           BP, R10
	ADDQ           BP, AX

	// Reload length to save a register: BP becomes the block counter
	MOVQ           n+80(FP), BP
	SHRQ           $0x05, BP

mulAvxGFNI_8x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs.
	// The first input writes the accumulators directly, so previous output
	// contents are never read.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs.
	// From matrix entry 9 onwards the table is re-broadcast from memory into
	// Y15 on every use, because Y0-Y8 are the only preloaded entries.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 5 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs and advance each output cursor by one block
	VMOVDQU        Y9, (R12)
	ADDQ           $0x20, R12
	VMOVDQU        Y10, (R13)
	ADDQ           $0x20, R13
	VMOVDQU        Y11, (R14)
	ADDQ           $0x20, R14
	VMOVDQU        Y12, (R15)
	ADDQ           $0x20, R15
	VMOVDQU        Y13, (R11)
	ADDQ           $0x20, R11

	// Prepare for next loop
	DECQ           BP
	JNZ            mulAvxGFNI_8x5_loop
	VZEROUPPER

mulAvxGFNI_8x5_end:
	RET
|
|
|
|
// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x5_64Xor combines 8 input slices into 5 output slices, 64 bytes per
// input per iteration, XORing the result into the bytes already stored in the
// outputs. For every 64-byte block:
//
//	out[o] ^= affine(matrix[0*5+o], in[0]) ^ ... ^ affine(matrix[7*5+o], in[7])
//
// where affine is VGF2P8AFFINEQB with a zero constant and matrix[i*5+o] is the
// 8-byte entry at byte offset (i*5+o)*8 from matrix_base.
//
// The loop runs n>>6 times, so only whole 64-byte blocks are handled; when
// n < 64 the routine returns without reading or writing any slice data.
// start is added to every input and output pointer before the loop.
//
// Register roles inside the loop:
//	BP                              remaining 64-byte blocks
//	CX                              matrix base
//	DX, BX, SI, DI, R8, R9, R10, AX  cursors into in[0] .. in[7]
//	R12, R13, R14, R15, R11         cursors into out[0] .. out[4]
//	Z0-Z24                          matrix entries 0-24, broadcast once
//	Z25-Z29                         accumulators for out[0] .. out[4]
//	Z30                             current input block
//	Z31                             scratch product
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the Go assembler saves and restores it - confirm.
// NOTE(review): this file is generated (see the header: gen.go); the duplicate
// out_base load and the TESTQ after SHRQ that were dropped here should also be
// removed in the generator.
TEXT ·mulGFNI_8x5_64Xor(SB), $8-88
	// Early exit when there is not a single whole block. SHRQ with a
	// non-zero count sets ZF from its result, so JZ can follow directly.
	MOVQ                n+80(FP), AX
	MOVQ                matrix_base+0(FP), CX
	SHRQ                $0x06, AX
	JZ                  mulGFNI_8x5_64Xor_end

	// Preload the first 25 of the 40 matrix entries (inputs 0-4).
	VBROADCASTF32X2     (CX), Z0
	VBROADCASTF32X2     8(CX), Z1
	VBROADCASTF32X2     16(CX), Z2
	VBROADCASTF32X2     24(CX), Z3
	VBROADCASTF32X2     32(CX), Z4
	VBROADCASTF32X2     40(CX), Z5
	VBROADCASTF32X2     48(CX), Z6
	VBROADCASTF32X2     56(CX), Z7
	VBROADCASTF32X2     64(CX), Z8
	VBROADCASTF32X2     72(CX), Z9
	VBROADCASTF32X2     80(CX), Z10
	VBROADCASTF32X2     88(CX), Z11
	VBROADCASTF32X2     96(CX), Z12
	VBROADCASTF32X2     104(CX), Z13
	VBROADCASTF32X2     112(CX), Z14
	VBROADCASTF32X2     120(CX), Z15
	VBROADCASTF32X2     128(CX), Z16
	VBROADCASTF32X2     136(CX), Z17
	VBROADCASTF32X2     144(CX), Z18
	VBROADCASTF32X2     152(CX), Z19
	VBROADCASTF32X2     160(CX), Z20
	VBROADCASTF32X2     168(CX), Z21
	VBROADCASTF32X2     176(CX), Z22
	VBROADCASTF32X2     184(CX), Z23
	VBROADCASTF32X2     192(CX), Z24

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). AX is overwritten last because it holds the header base; the
	// block count it held before is recomputed into BP below.
	MOVQ                in_base+24(FP), AX
	MOVQ                (AX), DX
	MOVQ                24(AX), BX
	MOVQ                48(AX), SI
	MOVQ                72(AX), DI
	MOVQ                96(AX), R8
	MOVQ                120(AX), R9
	MOVQ                144(AX), R10
	MOVQ                168(AX), AX

	// Fetch the data pointer of each output slice; R11 is overwritten last.
	MOVQ                out_base+48(FP), R11
	MOVQ                (R11), R12
	MOVQ                24(R11), R13
	MOVQ                48(R11), R14
	MOVQ                72(R11), R15
	MOVQ                96(R11), R11
	MOVQ                start+72(FP), BP

	// Add start offset to output
	ADDQ                BP, R12
	ADDQ                BP, R13
	ADDQ                BP, R14
	ADDQ                BP, R15
	ADDQ                BP, R11

	// Add start offset to input
	ADDQ                BP, DX
	ADDQ                BP, BX
	ADDQ                BP, SI
	ADDQ                BP, DI
	ADDQ                BP, R8
	ADDQ                BP, R9
	ADDQ                BP, R10
	ADDQ                BP, AX

	// Reload length to save a register: BP becomes the block counter
	MOVQ                n+80(FP), BP
	SHRQ                $0x06, BP

mulGFNI_8x5_64Xor_loop:
	// Load 5 outputs (the existing contents are the XOR seed)
	VMOVDQU64           (R12), Z25
	VMOVDQU64           (R13), Z26
	VMOVDQU64           (R14), Z27
	VMOVDQU64           (R15), Z28
	VMOVDQU64           (R11), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB      $0x00, Z0, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z1, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z2, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z3, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z4, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64           (BX), Z30
	ADDQ                $0x40, BX
	VGF2P8AFFINEQB      $0x00, Z5, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z6, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z7, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z8, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z9, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64           (SI), Z30
	ADDQ                $0x40, SI
	VGF2P8AFFINEQB      $0x00, Z10, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z11, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z12, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z13, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z14, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z15, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z22, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z23, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z24, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs.
	// Matrix entries 25-39 are not preloaded; they are used as embedded
	// broadcast memory operands (.BCST) instead.
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 5 outputs and advance each output cursor by one block
	VMOVDQU64           Z25, (R12)
	ADDQ                $0x40, R12
	VMOVDQU64           Z26, (R13)
	ADDQ                $0x40, R13
	VMOVDQU64           Z27, (R14)
	ADDQ                $0x40, R14
	VMOVDQU64           Z28, (R15)
	ADDQ                $0x40, R15
	VMOVDQU64           Z29, (R11)
	ADDQ                $0x40, R11

	// Prepare for next loop
	DECQ                BP
	JNZ                 mulGFNI_8x5_64Xor_loop
	VZEROUPPER

mulGFNI_8x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x5Xor combines 8 input slices into 5 output slices, 32 bytes per
// input per iteration, XORing the result into the bytes already stored in the
// outputs. For every 32-byte block:
//
//	out[o] ^= affine(matrix[0*5+o], in[0]) ^ ... ^ affine(matrix[7*5+o], in[7])
//
// where affine is VGF2P8AFFINEQB with a zero constant and matrix[i*5+o] is the
// 8-byte entry at byte offset (i*5+o)*8 from matrix_base.
//
// The loop runs n>>5 times, so only whole 32-byte blocks are handled; when
// n < 32 the routine returns without reading or writing any slice data.
// start is added to every input and output pointer before the loop.
//
// Register roles inside the loop:
//	BP                              remaining 32-byte blocks
//	CX                              matrix base
//	DX, BX, SI, DI, R8, R9, R10, AX  cursors into in[0] .. in[7]
//	R12, R13, R14, R15, R11         cursors into out[0] .. out[4]
//	Y0-Y8                           matrix entries 0-8, broadcast once
//	Y9-Y13                          accumulators for out[0] .. out[4]
//	Y14                             current input block
//	Y15                             scratch (table broadcast / product)
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the Go assembler saves and restores it - confirm.
// NOTE(review): this file is generated (see the header: gen.go); the duplicate
// out_base load and the TESTQ after SHRQ that were dropped here should also be
// removed in the generator.
TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88
	// Early exit when there is not a single whole block. SHRQ with a
	// non-zero count sets ZF from its result, so JZ can follow directly.
	MOVQ           n+80(FP), AX
	MOVQ           matrix_base+0(FP), CX
	SHRQ           $0x05, AX
	JZ             mulAvxGFNI_8x5Xor_end

	// Preload the first 9 of the 40 matrix entries
	// (all of input 0, plus outputs 0-3 of input 1).
	VBROADCASTSD   (CX), Y0
	VBROADCASTSD   8(CX), Y1
	VBROADCASTSD   16(CX), Y2
	VBROADCASTSD   24(CX), Y3
	VBROADCASTSD   32(CX), Y4
	VBROADCASTSD   40(CX), Y5
	VBROADCASTSD   48(CX), Y6
	VBROADCASTSD   56(CX), Y7
	VBROADCASTSD   64(CX), Y8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). AX is overwritten last because it holds the header base; the
	// block count it held before is recomputed into BP below.
	MOVQ           in_base+24(FP), AX
	MOVQ           (AX), DX
	MOVQ           24(AX), BX
	MOVQ           48(AX), SI
	MOVQ           72(AX), DI
	MOVQ           96(AX), R8
	MOVQ           120(AX), R9
	MOVQ           144(AX), R10
	MOVQ           168(AX), AX

	// Fetch the data pointer of each output slice; R11 is overwritten last.
	MOVQ           out_base+48(FP), R11
	MOVQ           (R11), R12
	MOVQ           24(R11), R13
	MOVQ           48(R11), R14
	MOVQ           72(R11), R15
	MOVQ           96(R11), R11
	MOVQ           start+72(FP), BP

	// Add start offset to output
	ADDQ           BP, R12
	ADDQ           BP, R13
	ADDQ           BP, R14
	ADDQ           BP, R15
	ADDQ           BP, R11

	// Add start offset to input
	ADDQ           BP, DX
	ADDQ           BP, BX
	ADDQ           BP, SI
	ADDQ           BP, DI
	ADDQ           BP, R8
	ADDQ           BP, R9
	ADDQ           BP, R10
	ADDQ           BP, AX

	// Reload length to save a register: BP becomes the block counter
	MOVQ           n+80(FP), BP
	SHRQ           $0x05, BP

mulAvxGFNI_8x5Xor_loop:
	// Load 5 outputs (the existing contents are the XOR seed)
	VMOVDQU        (R12), Y9
	VMOVDQU        (R13), Y10
	VMOVDQU        (R14), Y11
	VMOVDQU        (R15), Y12
	VMOVDQU        (R11), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs.
	// From matrix entry 9 onwards the table is re-broadcast from memory into
	// Y15 on every use, because Y0-Y8 are the only preloaded entries.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 5 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs and advance each output cursor by one block
	VMOVDQU        Y9, (R12)
	ADDQ           $0x20, R12
	VMOVDQU        Y10, (R13)
	ADDQ           $0x20, R13
	VMOVDQU        Y11, (R14)
	ADDQ           $0x20, R14
	VMOVDQU        Y12, (R15)
	ADDQ           $0x20, R15
	VMOVDQU        Y13, (R11)
	ADDQ           $0x20, R11

	// Prepare for next loop
	DECQ           BP
	JNZ            mulAvxGFNI_8x5Xor_loop
	VZEROUPPER

mulAvxGFNI_8x5Xor_end:
	RET
|
|
|
|
// mulGFNI_8x6_64 produces 6 outputs from 8 inputs, 64 bytes per loop iteration.
// For each block, output j is the XOR over inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]); previous output contents are overwritten.
// Only n>>6 whole 64-byte blocks are processed (nothing is done when n < 64),
// beginning at byte offset start in every input and output slice.
// NOTE: this file is generated ("DO NOT EDIT" header); a lasting change belongs in gen.go.

// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_8x6_64(SB), $0-88
	// Loading 24 of 48 tables to registers
	// Output pointers are not kept in registers; they are re-read from the
	// out slice headers (via R12) at every store.
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ by a non-zero immediate already sets ZF,
	// so no separate TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x6_64_end

	// Tables for inputs 0-3 (matrix[0..23]) broadcast into Z0-Z23.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Data pointers of in[0..7]; slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX

	// R12 = base of the out slice headers, R13 = running byte offset into outputs.
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators Z24-Z29, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (tables for inputs 4-7 are not register resident; they are broadcast from memory)
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 6 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	// R14 = data pointer of out[j], indexed by the running offset R13.
	MOVQ      (R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x6_64_loop
	VZEROUPPER

mulGFNI_8x6_64_end:
	RET
|
|
|
|
// mulAvxGFNI_8x6 produces 6 outputs from 8 inputs, 32 bytes per loop iteration.
// For each block, output j is the XOR over inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]); previous output contents are overwritten.
// Only n>>5 whole 32-byte blocks are processed (nothing is done when n < 32),
// beginning at byte offset start in every input and output slice.
// NOTE: this file is generated ("DO NOT EDIT" header); a lasting change belongs in gen.go.

// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x6(SB), $0-88
	// Loading 8 of 48 tables to registers
	// Output pointers are not kept in registers; they are re-read from the
	// out slice headers (via R12) at every store.
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ by a non-zero immediate already sets ZF,
	// so no separate TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x6_end

	// matrix[0..7] stay resident in Y0-Y7; the remaining tables are
	// broadcast into Y15 right before each use inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of in[0..7]; slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX

	// R12 = base of the out slice headers, R13 = running byte offset into outputs.
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators Y8-Y13, no XOR needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	// R14 = data pointer of out[j], indexed by the running offset R13.
	MOVQ    (R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x6_loop
	VZEROUPPER

mulAvxGFNI_8x6_end:
	RET
|
|
|
|
// mulGFNI_8x6_64Xor accumulates into 6 outputs from 8 inputs, 64 bytes per loop iteration.
// For each block, the existing contents of output j are loaded and XORed with
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]) for every input i, then stored back.
// Only n>>6 whole 64-byte blocks are processed (nothing is done when n < 64),
// beginning at byte offset start in every input and output slice.
// NOTE: this file is generated ("DO NOT EDIT" header); a lasting change belongs in gen.go.

// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_8x6_64Xor(SB), $0-88
	// Loading 24 of 48 tables to registers
	// Output pointers are not kept in registers; they are re-read from the
	// out slice headers (via R12) at every load and store.
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ by a non-zero immediate already sets ZF,
	// so no separate TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x6_64Xor_end

	// Tables for inputs 0-3 (matrix[0..23]) broadcast into Z0-Z23.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Data pointers of in[0..7]; slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX

	// R12 = base of the out slice headers, R13 = running byte offset into outputs.
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x6_64Xor_loop:
	// Load 6 outputs
	// (existing output bytes seed the accumulators Z24-Z29)
	MOVQ      (R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ      24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ      48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ      72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ      96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ      120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (tables for inputs 4-7 are not register resident; they are broadcast from memory)
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 6 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	// R14 = data pointer of out[j], indexed by the running offset R13.
	MOVQ      (R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x6_64Xor_loop
	VZEROUPPER

mulGFNI_8x6_64Xor_end:
	RET
|
|
|
|
// mulAvxGFNI_8x6Xor accumulates into 6 outputs from 8 inputs, 32 bytes per loop iteration.
// For each block, the existing contents of output j are loaded and XORed with
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]) for every input i, then stored back.
// Only n>>5 whole 32-byte blocks are processed (nothing is done when n < 32),
// beginning at byte offset start in every input and output slice.
// NOTE: this file is generated ("DO NOT EDIT" header); a lasting change belongs in gen.go.

// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88
	// Loading 8 of 48 tables to registers
	// Output pointers are not kept in registers; they are re-read from the
	// out slice headers (via R12) at every load and store.
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ by a non-zero immediate already sets ZF,
	// so no separate TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x6Xor_end

	// matrix[0..7] stay resident in Y0-Y7; the remaining tables are
	// broadcast into Y15 right before each use inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of in[0..7]; slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX

	// R12 = base of the out slice headers, R13 = running byte offset into outputs.
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x6Xor_loop:
	// Load 6 outputs
	// (existing output bytes seed the accumulators Y8-Y13)
	MOVQ    (R12), R14
	VMOVDQU (R14)(R13*1), Y8
	MOVQ    24(R12), R14
	VMOVDQU (R14)(R13*1), Y9
	MOVQ    48(R12), R14
	VMOVDQU (R14)(R13*1), Y10
	MOVQ    72(R12), R14
	VMOVDQU (R14)(R13*1), Y11
	MOVQ    96(R12), R14
	VMOVDQU (R14)(R13*1), Y12
	MOVQ    120(R12), R14
	VMOVDQU (R14)(R13*1), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	// R14 = data pointer of out[j], indexed by the running offset R13.
	MOVQ    (R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x6Xor_loop
	VZEROUPPER

mulAvxGFNI_8x6Xor_end:
	RET
|
|
|
|
// mulGFNI_8x7_64 produces 7 outputs from 8 inputs, 64 bytes per loop iteration.
// For each block, output j is the XOR over inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*7+j]); previous output contents are overwritten.
// Only n>>6 whole 64-byte blocks are processed (nothing is done when n < 64),
// beginning at byte offset start in every input and output slice.
// NOTE: this file is generated ("DO NOT EDIT" header); a lasting change belongs in gen.go.

// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_8x7_64(SB), $0-88
	// Loading 23 of 56 tables to registers
	// Output pointers are not kept in registers; they are re-read from the
	// out slice headers (via R12) at every store.
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ by a non-zero immediate already sets ZF,
	// so no separate TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x7_64_end

	// matrix[0..22] broadcast into Z0-Z22; Z23-Z29 are the 7 accumulators,
	// Z30 holds the current input block and Z31 is scratch.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Data pointers of in[0..7]; slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX

	// R12 = base of the out slice headers, R13 = running byte offset into outputs.
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// (first input initialises the accumulators Z23-Z29, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (only the first two tables of this input are register resident;
	// the rest are broadcast from memory)
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z22, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 7 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 7 outputs
	// R14 = data pointer of out[j], indexed by the running offset R13.
	MOVQ      (R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      144(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x7_64_loop
	VZEROUPPER

mulGFNI_8x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x7 combines 8 input slices into 7 output slices, 32 bytes per
// loop iteration. For every output j it stores the XOR over all inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*7+j]) with a zero constant, overwriting the
// previous contents of out[j]. It runs n/32 iterations beginning at byte
// offset start; the n%32 trailing bytes are not processed and nothing is
// done when n < 32.
//
// Register use:
//   AX                      remaining 32-byte iterations
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  read cursors for in[0]..in[7] (offset by start)
//   R12                     base of the out slice headers (24 bytes apart)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output being stored
//   Y0-Y6                   matrix[0..6], broadcast once before the loop
//   Y7-Y13                  accumulators for outputs 0..6
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_8x7(SB), $0-88
	// Loading 7 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate test.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// The first input initialises the accumulators directly (no XOR).
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	// Output data pointers are re-read from the slice headers on every pass.
	MOVQ (R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x7_loop
	VZEROUPPER

mulAvxGFNI_8x7_end:
	RET
|
|
|
|
// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x7_64Xor combines 8 input slices into 7 output slices, 64 bytes
// per loop iteration. Unlike the non-Xor variant it first loads the current
// contents of every output and XORs into them, for each input i and output
// j, VGF2P8AFFINEQB(in[i], matrix[i*7+j]) with a zero constant. It runs n/64
// iterations beginning at byte offset start; the n%64 trailing bytes are not
// processed and nothing is done when n < 64.
//
// Register use:
//   AX                      remaining 64-byte iterations
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  read cursors for in[0]..in[7] (offset by start)
//   R12                     base of the out slice headers (24 bytes apart)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output in use
//   Z0-Z22                  matrix[0..22], broadcast once before the loop;
//                           entries 23..55 are broadcast from memory per use
//   Z23-Z29                 accumulators for outputs 0..6
//   Z30                     current input block
//   Z31                     scratch product
TEXT ·mulGFNI_8x7_64Xor(SB), $0-88
	// Loading 23 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate test.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x7_64Xor_loop:
	// Load 7 outputs
	// Existing output bytes seed the accumulators.
	MOVQ (R12), R14
	VMOVDQU64 (R14)(R13*1), Z23
	MOVQ 24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ 48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ 72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ 96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ 120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ 144(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// From matrix entry 23 onwards the table is broadcast from memory.
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	MOVQ (R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x7_64Xor_loop
	VZEROUPPER

mulGFNI_8x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x7Xor combines 8 input slices into 7 output slices, 32 bytes
// per loop iteration. Unlike the non-Xor variant it first loads the current
// contents of every output and XORs into them, for each input i and output
// j, VGF2P8AFFINEQB(in[i], matrix[i*7+j]) with a zero constant. It runs n/32
// iterations beginning at byte offset start; the n%32 trailing bytes are not
// processed and nothing is done when n < 32.
//
// Register use:
//   AX                      remaining 32-byte iterations
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  read cursors for in[0]..in[7] (offset by start)
//   R12                     base of the out slice headers (24 bytes apart)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output in use
//   Y0-Y6                   matrix[0..6], broadcast once before the loop
//   Y7-Y13                  accumulators for outputs 0..6
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88
	// Loading 7 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate test.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x7Xor_loop:
	// Load 7 outputs
	// Existing output bytes seed the accumulators.
	MOVQ (R12), R14
	VMOVDQU (R14)(R13*1), Y7
	MOVQ 24(R12), R14
	VMOVDQU (R14)(R13*1), Y8
	MOVQ 48(R12), R14
	VMOVDQU (R14)(R13*1), Y9
	MOVQ 72(R12), R14
	VMOVDQU (R14)(R13*1), Y10
	MOVQ 96(R12), R14
	VMOVDQU (R14)(R13*1), Y11
	MOVQ 120(R12), R14
	VMOVDQU (R14)(R13*1), Y12
	MOVQ 144(R12), R14
	VMOVDQU (R14)(R13*1), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	MOVQ (R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x7Xor_loop
	VZEROUPPER

mulAvxGFNI_8x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x8_64 combines 8 input slices into 8 output slices, 64 bytes per
// loop iteration. For every output j it stores the XOR over all inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*8+j]) with a zero constant, overwriting the
// previous contents of out[j]. It runs n/64 iterations beginning at byte
// offset start; the n%64 trailing bytes are not processed and nothing is
// done when n < 64.
//
// Register use:
//   AX                      remaining 64-byte iterations
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  read cursors for in[0]..in[7] (offset by start)
//   R12                     base of the out slice headers (24 bytes apart)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output being stored
//   Z0-Z21                  matrix[0..21], broadcast once before the loop;
//                           entries 22..63 are broadcast from memory per use
//   Z22-Z29                 accumulators for outputs 0..7
//   Z30                     current input block
//   Z31                     scratch product
TEXT ·mulGFNI_8x8_64(SB), $0-88
	// Loading 22 of 64 tables to registers
	// Destination kept on stack
	// Full registers estimated 74 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the branch needs no separate test.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// The first input initialises the accumulators directly (no XOR).
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// From matrix entry 22 onwards the table is broadcast from memory.
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	// Output data pointers are re-read from the slice headers on every pass.
	MOVQ (R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ 168(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x8_64_loop
	VZEROUPPER

mulGFNI_8x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x8 combines 8 input slices into 8 output slices, 32 bytes per
// loop iteration, overwriting the outputs.
//
// For every 32-byte block, out[j] receives the XOR over i = 0..7 of
// VGF2P8AFFINEQB(in[i], matrix[i*8+j]) with imm8 = 0 (no additive constant).
// Only n>>5 whole blocks are processed; a tail of fewer than 32 bytes is not
// touched. Inputs are read and outputs are written from byte offset start.
// NOTE(review): presumably each matrix word is the 8x8 bit matrix of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  data pointers of in[0]..in[7], advanced per block
//   R12                     base of the out slice-header array
//   R13                     byte offset into every output (starts at start)
//   R14                     scratch: data pointer of the output being stored
//   Y0-Y5                   matrix[0..5], resident for the whole loop
//   Y6-Y13                  accumulators for out[0]..out[7]
//   Y14                     current input block
//   Y15                     scratch: matrix entry, then its product
TEXT ·mulAvxGFNI_8x8(SB), $0-88
	// Loading 6 of 64 tables to registers
	// Destination pointers are re-read from the out slice headers at every store
	// Full registers estimated 74 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ sets ZF from its result,
	// so the zero check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Slice headers are 24 bytes apart; DX is overwritten last because it
	// also serves as the base while the other seven pointers are loaded.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD   48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	MOVQ    (R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    144(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    168(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x8_loop
	VZEROUPPER

mulAvxGFNI_8x8_end:
	RET
|
|
|
|
// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x8_64Xor combines 8 input slices into 8 output slices, 64 bytes
// per loop iteration, XOR-ing the result into the existing output contents.
//
// For every 64-byte block, out[j] becomes out[j] XOR the XOR over i = 0..7 of
// VGF2P8AFFINEQB(in[i], matrix[i*8+j]) with imm8 = 0 (no additive constant).
// Only n>>6 whole blocks are processed; a tail of fewer than 64 bytes is not
// touched. Inputs and outputs are accessed from byte offset start.
// NOTE(review): presumably each matrix word is the 8x8 bit matrix of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  data pointers of in[0]..in[7], advanced per block
//   R12                     base of the out slice-header array
//   R13                     byte offset into every output (starts at start)
//   R14                     scratch: data pointer of the current output
//   Z0-Z21                  matrix[0..21], resident for the whole loop
//   Z22-Z29                 accumulators for out[0]..out[7]
//   Z30                     current input block
//   Z31                     scratch: product to fold into an accumulator
// matrix[22..63] are not resident; they are read with an embedded broadcast
// (.BCST) memory operand each time they are used.
TEXT ·mulGFNI_8x8_64Xor(SB), $0-88
	// Loading 22 of 64 tables to registers
	// Destination pointers are re-read from the out slice headers at every access
	// Full registers estimated 74 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ sets ZF from its result,
	// so the zero check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Slice headers are 24 bytes apart; DX is overwritten last because it
	// also serves as the base while the other seven pointers are loaded.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x8_64Xor_loop:
	// Load 8 outputs
	MOVQ      (R12), R14
	VMOVDQU64 (R14)(R13*1), Z22
	MOVQ      24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z23
	MOVQ      48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ      72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ      96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ      120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ      144(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ      168(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (resident tables run out after Z21; the rest come from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	MOVQ      (R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      144(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      168(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x8_64Xor_loop
	VZEROUPPER

mulGFNI_8x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x8Xor combines 8 input slices into 8 output slices, 32 bytes
// per loop iteration, XOR-ing the result into the existing output contents.
//
// For every 32-byte block, out[j] becomes out[j] XOR the XOR over i = 0..7 of
// VGF2P8AFFINEQB(in[i], matrix[i*8+j]) with imm8 = 0 (no additive constant).
// Only n>>5 whole blocks are processed; a tail of fewer than 32 bytes is not
// touched. Inputs and outputs are accessed from byte offset start.
// NOTE(review): presumably each matrix word is the 8x8 bit matrix of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  data pointers of in[0]..in[7], advanced per block
//   R12                     base of the out slice-header array
//   R13                     byte offset into every output (starts at start)
//   R14                     scratch: data pointer of the current output
//   Y0-Y5                   matrix[0..5], resident for the whole loop
//   Y6-Y13                  accumulators for out[0]..out[7]
//   Y14                     current input block
//   Y15                     scratch: matrix entry, then its product
TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88
	// Loading 6 of 64 tables to registers
	// Destination pointers are re-read from the out slice headers at every access
	// Full registers estimated 74 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ sets ZF from its result,
	// so the zero check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Slice headers are 24 bytes apart; DX is overwritten last because it
	// also serves as the base while the other seven pointers are loaded.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x8Xor_loop:
	// Load 8 outputs
	MOVQ    (R12), R14
	VMOVDQU (R14)(R13*1), Y6
	MOVQ    24(R12), R14
	VMOVDQU (R14)(R13*1), Y7
	MOVQ    48(R12), R14
	VMOVDQU (R14)(R13*1), Y8
	MOVQ    72(R12), R14
	VMOVDQU (R14)(R13*1), Y9
	MOVQ    96(R12), R14
	VMOVDQU (R14)(R13*1), Y10
	MOVQ    120(R12), R14
	VMOVDQU (R14)(R13*1), Y11
	MOVQ    144(R12), R14
	VMOVDQU (R14)(R13*1), Y12
	MOVQ    168(R12), R14
	VMOVDQU (R14)(R13*1), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	MOVQ    (R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    144(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    168(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x8Xor_loop
	VZEROUPPER

mulAvxGFNI_8x8Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x9_64 combines 8 input slices into 9 output slices, 64 bytes per
// loop iteration, overwriting the outputs.
//
// For every 64-byte block, out[j] receives the XOR over i = 0..7 of
// VGF2P8AFFINEQB(in[i], matrix[i*9+j]) with imm8 = 0 (no additive constant).
// Only n>>6 whole blocks are processed; a tail of fewer than 64 bytes is not
// touched. Inputs are read and outputs are written from byte offset start.
// NOTE(review): presumably each matrix word is the 8x8 bit matrix of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R11, DX  data pointers of in[0]..in[7], advanced per block
//   R12                     base of the out slice-header array
//   R13                     byte offset into every output (starts at start)
//   R14                     scratch: data pointer of the output being stored
//   Z0-Z20                  matrix[0..20], resident for the whole loop
//   Z21-Z29                 accumulators for out[0]..out[8]
//   Z30                     current input block
//   Z31                     scratch: product to fold into an accumulator
// matrix[21..71] are not resident; they are read with an embedded broadcast
// (.BCST) memory operand each time they are used.
TEXT ·mulGFNI_8x9_64(SB), $0-88
	// Loading 21 of 72 tables to registers
	// Destination pointers are re-read from the out slice headers at every store
	// Full registers estimated 83 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ sets ZF from its result,
	// so the zero check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Slice headers are 24 bytes apart; DX is overwritten last because it
	// also serves as the base while the other seven pointers are loaded.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (resident tables run out after Z20; the rest come from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	MOVQ      (R12), R14
	VMOVDQU64 Z21, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      144(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      168(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      192(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x9_64_loop
	VZEROUPPER

mulGFNI_8x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites 9 output slices with a combination of 8 input slices, working on
// 32 bytes per iteration:
//
//   out[j][off:off+32] = XOR over k = 0..7 of AFFINE(matrix[k*9+j], in[k][off:off+32])
//
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and off runs from
// start in steps of 32 for n>>5 iterations. The previous contents of the
// outputs are not read. Any trailing n%32 bytes are not processed, and the
// function returns without touching memory when n < 32.
// NOTE(review): each matrix qword is presumably the 8x8 bit-matrix form of a
// GF(2^8) multiplication coefficient - confirm against the Go caller.
//
// Register use:
//   AX       remaining 32-byte blocks
//   CX       matrix base (72 qwords, row-major: 9 per input)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            read cursors for in[0]..in[7] (already advanced by start)
//   R12      base of the out slice-header array (24 bytes per header)
//   R13      byte offset into every output (starts at start)
//   R14      scratch: data pointer of the output being stored
//   Y0-Y4    cached tables matrix[0..4] (input 0, outputs 0-4)
//   Y5-Y13   accumulators for outputs 0-8
//   Y14      current 32-byte input block
//   Y15      scratch: broadcast table, then the product
//
// Only 5 of the 72 tables fit in registers; the rest are re-broadcast from
// memory every iteration. Output pointers are likewise re-read from the out
// slice headers at store time instead of being held in registers.
TEXT ·mulAvxGFNI_8x9(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ by a non-zero immediate sets ZF from the result,
	// so the branch can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x9_end

	// Cache tables 0-4.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice. DX holds the header array
	// while loading and is finally reused for in[7].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x9_loop:
	// Input 0 initialises the 9 accumulators (no XOR needed).
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Input 1: tables 9-17, XORed into the accumulators.
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 2: tables 18-26.
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 3: tables 27-35.
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 4: tables 36-44.
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 5: tables 45-53.
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 6: tables 54-62.
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 7: tables 63-71.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs. Each data pointer is re-read from its slice header.
	MOVQ    (R12), R14
	VMOVDQU Y5, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    144(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    168(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    192(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x9_loop

	// Clear the upper YMM halves before returning. The early-exit path jumps
	// past this because it never touched a vector register.
	VZEROUPPER

mulAvxGFNI_8x9_end:
	RET
|
|
|
|
// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs a combination of 8 input slices into 9 output slices, working on
// 64 bytes per iteration:
//
//   out[j][off:off+64] ^= XOR over k = 0..7 of AFFINE(matrix[k*9+j], in[k][off:off+64])
//
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and off runs from
// start in steps of 64 for n>>6 iterations. Unlike the non-Xor variant, the
// existing output bytes are loaded first and preserved in the result. Any
// trailing n%64 bytes are not processed, and the function returns without
// touching memory when n < 64.
// NOTE(review): each matrix qword is presumably the 8x8 bit-matrix form of a
// GF(2^8) multiplication coefficient - confirm against the Go caller.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base (72 qwords, row-major: 9 per input)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            read cursors for in[0]..in[7] (already advanced by start)
//   R12      base of the out slice-header array (24 bytes per header)
//   R13      byte offset into every output (starts at start)
//   R14      scratch: data pointer of the output being loaded or stored
//   Z0-Z20   cached tables matrix[0..20]
//   Z21-Z29  accumulators for outputs 0-8
//   Z30      current 64-byte input block
//   Z31      scratch: product before it is XORed into an accumulator
//
// 21 of the 72 tables are kept in registers; the remaining 51 are consumed
// straight from memory through the embedded-broadcast (.BCST) operand form.
// Output pointers are re-read from the out slice headers on every access
// instead of being held in registers.
TEXT ·mulGFNI_8x9_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64. SHRQ by a non-zero immediate sets ZF from the result,
	// so the branch can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x9_64Xor_end

	// Cache tables 0-20.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice. DX holds the header array
	// while loading and is finally reused for in[7].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x9_64Xor_loop:
	// Load 9 outputs: the accumulators start from the existing output bytes.
	MOVQ      (R12), R14
	VMOVDQU64 (R14)(R13*1), Z21
	MOVQ      24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z22
	MOVQ      48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z23
	MOVQ      72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ      96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ      120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ      144(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ      168(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ      192(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Input 0: cached tables Z0-Z8.
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Input 1: cached tables Z9-Z17.
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Input 2: cached tables Z18-Z20, then tables 21-26 from memory.
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Input 3: tables 27-35 from memory.
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Input 4: tables 36-44 from memory.
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Input 5: tables 45-53 from memory.
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Input 6: tables 54-62 from memory.
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Input 7: tables 63-71 from memory.
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs. Each data pointer is re-read from its slice header.
	MOVQ      (R12), R14
	VMOVDQU64 Z21, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      144(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      168(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      192(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x9_64Xor_loop

	// Clear the upper vector state before returning. The early-exit path
	// jumps past this because it never touched a vector register.
	VZEROUPPER

mulGFNI_8x9_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs a combination of 8 input slices into 9 output slices, working on
// 32 bytes per iteration:
//
//   out[j][off:off+32] ^= XOR over k = 0..7 of AFFINE(matrix[k*9+j], in[k][off:off+32])
//
// where AFFINE is VGF2P8AFFINEQB with a zero immediate and off runs from
// start in steps of 32 for n>>5 iterations. Unlike the non-Xor variant, the
// existing output bytes are loaded first and preserved in the result. Any
// trailing n%32 bytes are not processed, and the function returns without
// touching memory when n < 32.
// NOTE(review): each matrix qword is presumably the 8x8 bit-matrix form of a
// GF(2^8) multiplication coefficient - confirm against the Go caller.
//
// Register use:
//   AX       remaining 32-byte blocks
//   CX       matrix base (72 qwords, row-major: 9 per input)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            read cursors for in[0]..in[7] (already advanced by start)
//   R12      base of the out slice-header array (24 bytes per header)
//   R13      byte offset into every output (starts at start)
//   R14      scratch: data pointer of the output being loaded or stored
//   Y0-Y4    cached tables matrix[0..4] (input 0, outputs 0-4)
//   Y5-Y13   accumulators for outputs 0-8
//   Y14      current 32-byte input block
//   Y15      scratch: broadcast table, then the product
//
// Only 5 of the 72 tables fit in registers; the rest are re-broadcast from
// memory every iteration. Output pointers are re-read from the out slice
// headers on every access instead of being held in registers.
TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ by a non-zero immediate sets ZF from the result,
	// so the branch can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x9Xor_end

	// Cache tables 0-4.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice. DX holds the header array
	// while loading and is finally reused for in[7].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x9Xor_loop:
	// Load 9 outputs: the accumulators start from the existing output bytes.
	MOVQ    (R12), R14
	VMOVDQU (R14)(R13*1), Y5
	MOVQ    24(R12), R14
	VMOVDQU (R14)(R13*1), Y6
	MOVQ    48(R12), R14
	VMOVDQU (R14)(R13*1), Y7
	MOVQ    72(R12), R14
	VMOVDQU (R14)(R13*1), Y8
	MOVQ    96(R12), R14
	VMOVDQU (R14)(R13*1), Y9
	MOVQ    120(R12), R14
	VMOVDQU (R14)(R13*1), Y10
	MOVQ    144(R12), R14
	VMOVDQU (R14)(R13*1), Y11
	MOVQ    168(R12), R14
	VMOVDQU (R14)(R13*1), Y12
	MOVQ    192(R12), R14
	VMOVDQU (R14)(R13*1), Y13

	// Input 0: cached tables Y0-Y4, then tables 5-8 from memory.
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 1: tables 9-17.
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 2: tables 18-26.
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 3: tables 27-35.
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 4: tables 36-44.
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 5: tables 45-53.
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 6: tables 54-62.
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Input 7: tables 63-71.
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs. Each data pointer is re-read from its slice header.
	MOVQ    (R12), R14
	VMOVDQU Y5, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    144(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    168(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    192(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x9Xor_loop

	// Clear the upper YMM halves before returning. The early-exit path jumps
	// past this because it never touched a vector register.
	VZEROUPPER

mulAvxGFNI_8x9Xor_end:
	RET
|
|
|
|
// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_8x10_64(SB), $0-88
|
|
// Loading 20 of 80 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 92 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_8x10_64_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), DX
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ start+72(FP), R13
|
|
|
|
// Add start offset to input
|
|
ADDQ R13, BX
|
|
ADDQ R13, SI
|
|
ADDQ R13, DI
|
|
ADDQ R13, R8
|
|
ADDQ R13, R9
|
|
ADDQ R13, R10
|
|
ADDQ R13, R11
|
|
ADDQ R13, DX
|
|
|
|
mulGFNI_8x10_64_loop:
|
|
// Load and process 64 bytes from input 0 to 10 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 10 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 10 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 10 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 10 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 10 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 10 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 10 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
|
|
VXORPD Z20, Z31, Z20
|
|
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 10 outputs
|
|
MOVQ (R12), R14
|
|
VMOVDQU64 Z20, (R14)(R13*1)
|
|
MOVQ 24(R12), R14
|
|
VMOVDQU64 Z21, (R14)(R13*1)
|
|
MOVQ 48(R12), R14
|
|
VMOVDQU64 Z22, (R14)(R13*1)
|
|
MOVQ 72(R12), R14
|
|
VMOVDQU64 Z23, (R14)(R13*1)
|
|
MOVQ 96(R12), R14
|
|
VMOVDQU64 Z24, (R14)(R13*1)
|
|
MOVQ 120(R12), R14
|
|
VMOVDQU64 Z25, (R14)(R13*1)
|
|
MOVQ 144(R12), R14
|
|
VMOVDQU64 Z26, (R14)(R13*1)
|
|
MOVQ 168(R12), R14
|
|
VMOVDQU64 Z27, (R14)(R13*1)
|
|
MOVQ 192(R12), R14
|
|
VMOVDQU64 Z28, (R14)(R13*1)
|
|
MOVQ 216(R12), R14
|
|
VMOVDQU64 Z29, (R14)(R13*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x40, R13
|
|
DECQ AX
|
|
JNZ mulGFNI_8x10_64_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_8x10_64_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x10 combines 8 input slices into 10 output slices, handling
// 32 bytes (one YMM register) of every slice per loop iteration. The outputs
// are overwritten; previous output contents are not read.
//
//   matrix: 80 uint64 entries read at byte offset (input*10+output)*8. Each
//           entry is broadcast and used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB (immediate 0).
//   in:     8 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    10 slice headers, 24 bytes apart; the data pointers are re-read
//           from memory on every iteration.
//   start:  byte offset added to every input pointer and used as the index
//           into every output buffer.
//   n:      byte count. n>>5 iterations are executed, so a tail of n%32 bytes
//           is left untouched and n < 32 returns without doing any work.
//
// No length checks are performed in this routine.
//
// Register roles:
//   AX                    remaining iterations
//   CX                    matrix base
//   BX,SI,DI,R8-R11,DX    read cursors for inputs 0..7
//   R12                   base of the out slice-header array
//   R13                   current byte offset into every output
//   R14                   scratch: data pointer of the output being stored
//   Y0-Y3                 matrix[0..3], the only tables kept resident
//   Y4-Y13                accumulators for outputs 0..9
//   Y14                   current 32-byte input block
//   Y15                   scratch: broadcast table, then its product
TEXT ·mulAvxGFNI_8x10(SB), $0-88
	// Loading 4 of 80 tables to registers
	// Destination kept on stack
	// Full registers estimated 92 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_8x10_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX // DX is overwritten last: it held the header base until here
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	// Input 0 initialises the accumulators, so no XOR is needed here.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	// Each output data pointer is fetched from its slice header and indexed by R13.
	MOVQ (R12), R14
	VMOVDQU Y4, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU Y5, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ 168(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ 192(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ 216(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ mulAvxGFNI_8x10_loop
	VZEROUPPER

mulAvxGFNI_8x10_end:
	RET
|
|
|
|
// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x10_64Xor combines 8 input slices into 10 output slices, handling
// 64 bytes (one ZMM register) of every slice per loop iteration. The existing
// output contents are loaded first and the products are XORed into them.
//
//   matrix: 80 uint64 entries read at byte offset (input*10+output)*8. Each
//           entry is used as the 8x8 bit-matrix operand of VGF2P8AFFINEQB
//           (immediate 0).
//   in:     8 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    10 slice headers, 24 bytes apart; the data pointers are re-read
//           from memory for both the load and the store of every iteration.
//   start:  byte offset added to every input pointer and used as the index
//           into every output buffer.
//   n:      byte count. n>>6 iterations are executed, so a tail of n%64 bytes
//           is left untouched and n < 64 returns without doing any work.
//
// No length checks are performed in this routine.
//
// Register roles:
//   AX                    remaining iterations
//   CX                    matrix base
//   BX,SI,DI,R8-R11,DX    read cursors for inputs 0..7
//   R12                   base of the out slice-header array
//   R13                   current byte offset into every output
//   R14                   scratch: data pointer of the output being accessed
//   Z0-Z19                matrix[0..19] (tables for inputs 0 and 1)
//   Z20-Z29               accumulators for outputs 0..9
//   Z30                   current 64-byte input block
//   Z31                   scratch product
TEXT ·mulGFNI_8x10_64Xor(SB), $0-88
	// Loading 20 of 80 tables to registers
	// Destination kept on stack
	// Full registers estimated 92 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_8x10_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX // DX is overwritten last: it held the header base until here
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x10_64Xor_loop:
	// Load 10 outputs
	MOVQ (R12), R14
	VMOVDQU64 (R14)(R13*1), Z20
	MOVQ 24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z21
	MOVQ 48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z22
	MOVQ 72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z23
	MOVQ 96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ 120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ 144(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ 168(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ 192(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ 216(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	// From here on the tables are not resident; they are broadcast from memory.
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R12), R14
	VMOVDQU64 Z20, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU64 Z21, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ 168(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ 192(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ 216(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ mulGFNI_8x10_64Xor_loop
	VZEROUPPER

mulGFNI_8x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x10Xor combines 8 input slices into 10 output slices, handling
// 32 bytes (one YMM register) of every slice per loop iteration. The existing
// output contents are loaded first and the products are XORed into them.
//
//   matrix: 80 uint64 entries read at byte offset (input*10+output)*8. Each
//           entry is broadcast and used as the 8x8 bit-matrix operand of
//           VGF2P8AFFINEQB (immediate 0).
//   in:     8 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    10 slice headers, 24 bytes apart; the data pointers are re-read
//           from memory for both the load and the store of every iteration.
//   start:  byte offset added to every input pointer and used as the index
//           into every output buffer.
//   n:      byte count. n>>5 iterations are executed, so a tail of n%32 bytes
//           is left untouched and n < 32 returns without doing any work.
//
// No length checks are performed in this routine.
//
// Register roles:
//   AX                    remaining iterations
//   CX                    matrix base
//   BX,SI,DI,R8-R11,DX    read cursors for inputs 0..7
//   R12                   base of the out slice-header array
//   R13                   current byte offset into every output
//   R14                   scratch: data pointer of the output being accessed
//   Y0-Y3                 matrix[0..3], the only tables kept resident
//   Y4-Y13                accumulators for outputs 0..9
//   Y14                   current 32-byte input block
//   Y15                   scratch: broadcast table, then its product
TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88
	// Loading 4 of 80 tables to registers
	// Destination kept on stack
	// Full registers estimated 92 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_8x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX // DX is overwritten last: it held the header base until here
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x10Xor_loop:
	// Load 10 outputs
	MOVQ (R12), R14
	VMOVDQU (R14)(R13*1), Y4
	MOVQ 24(R12), R14
	VMOVDQU (R14)(R13*1), Y5
	MOVQ 48(R12), R14
	VMOVDQU (R14)(R13*1), Y6
	MOVQ 72(R12), R14
	VMOVDQU (R14)(R13*1), Y7
	MOVQ 96(R12), R14
	VMOVDQU (R14)(R13*1), Y8
	MOVQ 120(R12), R14
	VMOVDQU (R14)(R13*1), Y9
	MOVQ 144(R12), R14
	VMOVDQU (R14)(R13*1), Y10
	MOVQ 168(R12), R14
	VMOVDQU (R14)(R13*1), Y11
	MOVQ 192(R12), R14
	VMOVDQU (R14)(R13*1), Y12
	MOVQ 216(R12), R14
	VMOVDQU (R14)(R13*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R12), R14
	VMOVDQU Y4, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU Y5, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ 144(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ 168(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ 192(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ 216(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ mulAvxGFNI_8x10Xor_loop
	VZEROUPPER

mulAvxGFNI_8x10Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte chunk, applies matrix[i] to in[i] with VGF2P8AFFINEQB
// (i = 0..8), XORs the nine results together and stores them to out[0].
// All slices are addressed from byte offset start. n>>6 chunks are processed;
// the function returns without touching memory when n>>6 is zero, and any
// n%64 tail is not processed here.
//
// Register use:
//   AX      remaining 64-byte chunks
//   Z0-Z8   matrix[0..8], each broadcast to all qword lanes
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read cursors for in[0..8]
//   R12     write cursor for out[0]
//   Z9      accumulator, Z10 scratch
TEXT ·mulGFNI_9x1_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x1_64_end

	// Load all 9 matrix entries into registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). CX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulGFNI_9x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// (the first input initialises the accumulator, no XOR needed)
	VMOVDQU64      (DX), Z10
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z9

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64      (BX), Z10
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64      (SI), Z10
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64      (DI), Z10
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64      (R8), Z10
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64      (R9), Z10
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64      (R10), Z10
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64      (R11), Z10
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64      (CX), Z10
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Store 1 outputs
	VMOVDQU64 Z9, (R12)
	ADDQ      $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_9x1_64_loop
	VZEROUPPER

mulGFNI_9x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte chunk, applies matrix[i] to in[i] with VGF2P8AFFINEQB
// (i = 0..8), XORs the nine results together and stores them to out[0].
// All slices are addressed from byte offset start. n>>5 chunks are processed;
// the function returns without touching memory when n>>5 is zero, and any
// n%32 tail is not processed here.
//
// Register use:
//   AX      remaining 32-byte chunks
//   Y0-Y8   matrix[0..8], each broadcast to all qword lanes
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read cursors for in[0..8]
//   R12     write cursor for out[0]
//   Y9      accumulator, Y10 scratch
TEXT ·mulAvxGFNI_9x1(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x1_end

	// Load all 9 matrix entries into registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). CX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulAvxGFNI_9x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// (the first input initialises the accumulator, no XOR needed)
	VMOVDQU        (DX), Y10
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y9

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (BX), Y10
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU        (SI), Y10
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU        (DI), Y10
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU        (R8), Y10
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU        (R9), Y10
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU        (R10), Y10
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU        (R11), Y10
	ADDQ           $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU        (CX), Y10
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Store 1 outputs
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_9x1_loop
	VZEROUPPER

mulAvxGFNI_9x1_end:
	RET
|
|
|
|
// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: for every 64-byte chunk, loads the current contents
// of out[0], XORs in the VGF2P8AFFINEQB transform of in[i] by matrix[i]
// (i = 0..8) and writes the result back to out[0].
// All slices are addressed from byte offset start. n>>6 chunks are processed;
// the function returns without touching memory when n>>6 is zero, and any
// n%64 tail is not processed here.
//
// Register use:
//   AX      remaining 64-byte chunks
//   Z0-Z8   matrix[0..8], each broadcast to all qword lanes
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read cursors for in[0..8]
//   R12     read/write cursor for out[0]
//   Z9      accumulator, Z10 scratch
TEXT ·mulGFNI_9x1_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x1_64Xor_end

	// Load all 9 matrix entries into registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). CX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulGFNI_9x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R12), Z9

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64      (DX), Z10
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64      (BX), Z10
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64      (SI), Z10
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64      (DI), Z10
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64      (R8), Z10
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64      (R9), Z10
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64      (R10), Z10
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64      (R11), Z10
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64      (CX), Z10
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
	VXORPD         Z9, Z10, Z9

	// Store 1 outputs
	VMOVDQU64 Z9, (R12)
	ADDQ      $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_9x1_64Xor_loop
	VZEROUPPER

mulGFNI_9x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: for every 32-byte chunk, loads the current contents
// of out[0], XORs in the VGF2P8AFFINEQB transform of in[i] by matrix[i]
// (i = 0..8) and writes the result back to out[0].
// All slices are addressed from byte offset start. n>>5 chunks are processed;
// the function returns without touching memory when n>>5 is zero, and any
// n%32 tail is not processed here.
//
// Register use:
//   AX      remaining 32-byte chunks
//   Y0-Y8   matrix[0..8], each broadcast to all qword lanes
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read cursors for in[0..8]
//   R12     read/write cursor for out[0]
//   Y9      accumulator, Y10 scratch
TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x1Xor_end

	// Load all 9 matrix entries into registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). CX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Data pointer of out[0].
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulAvxGFNI_9x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R12), Y9

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU        (DX), Y10
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (BX), Y10
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU        (SI), Y10
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU        (DI), Y10
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU        (R8), Y10
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU        (R9), Y10
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU        (R10), Y10
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU        (R11), Y10
	ADDQ           $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU        (CX), Y10
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
	VXORPD         Y9, Y10, Y9

	// Store 1 outputs
	VMOVDQU Y9, (R12)
	ADDQ    $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_9x1Xor_loop
	VZEROUPPER

mulAvxGFNI_9x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte chunk, computes two outputs from nine inputs:
//   out[0] = XOR over i of affine(matrix[2*i],   in[i])
//   out[1] = XOR over i of affine(matrix[2*i+1], in[i])
// where affine is VGF2P8AFFINEQB with a zero immediate.
// All slices are addressed from byte offset start. n>>6 chunks are processed;
// the function returns without touching memory when n>>6 is zero, and any
// n%64 tail is not processed here.
//
// Register use:
//   AX       remaining 64-byte chunks
//   Z0-Z17   matrix[0..17], each broadcast to all qword lanes
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read cursors for in[0..8]
//   R13, R12 write cursors for out[0], out[1]
//   Z18, Z19 accumulators; Z20 input data; Z21 scratch
TEXT ·mulGFNI_9x2_64(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x2_64_end

	// Load all 18 matrix entries into registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). CX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Data pointers of out[0] and out[1].
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R12
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13
	ADDQ R14, R12

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, CX

mulGFNI_9x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// (the first input initialises both accumulators, no XOR needed)
	VMOVDQU64      (DX), Z20
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z19

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z20
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z20
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z20
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (R8), Z20
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64      (R9), Z20
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64      (R10), Z20
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64      (R11), Z20
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 2 outputs
	VMOVDQU64 Z18, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z19, (R12)
	ADDQ      $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_9x2_64_loop
	VZEROUPPER

mulGFNI_9x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte chunk, computes two outputs from nine inputs:
//   out[0] = XOR over i of affine(matrix[2*i],   in[i])
//   out[1] = XOR over i of affine(matrix[2*i+1], in[i])
// where affine is VGF2P8AFFINEQB with a zero immediate.
// All slices are addressed from byte offset start. n>>5 chunks are processed;
// the function returns without touching memory when n>>5 is zero, and any
// n%32 tail is not processed here.
//
// Only 12 of the 18 matrix entries fit in registers; entries 12..17 are
// re-broadcast from memory on every iteration, so CX must keep pointing at
// the matrix for the whole loop.
//
// Register use:
//   AX       remaining 32-byte chunks
//   CX       matrix base (live throughout)
//   Y0-Y11   matrix[0..11], each broadcast to all qword lanes
//   BX, SI, DI, R8, R9, R10, R11, R12, DX   read cursors for in[0..8]
//   R14, R13 write cursors for out[0], out[1]
//   Y12, Y13 accumulators; Y14 input data; Y15 scratch
TEXT ·mulAvxGFNI_9x2(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x2_end

	// Load the first 12 matrix entries into registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). DX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Data pointers of out[0] and out[1].
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R13
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, DX

mulAvxGFNI_9x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// (the first input initialises both accumulators, no XOR needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (matrix entries from here on are broadcast from memory each iteration)
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (R13)
	ADDQ    $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_9x2_loop
	VZEROUPPER

mulAvxGFNI_9x2_end:
	RET
|
|
|
|
// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: for every 64-byte chunk, loads the current contents
// of out[0] and out[1] and XORs into them
//   out[0] ^= XOR over i of affine(matrix[2*i],   in[i])
//   out[1] ^= XOR over i of affine(matrix[2*i+1], in[i])
// where affine is VGF2P8AFFINEQB with a zero immediate.
// All slices are addressed from byte offset start. n>>6 chunks are processed;
// the function returns without touching memory when n>>6 is zero, and any
// n%64 tail is not processed here.
//
// Register use:
//   AX       remaining 64-byte chunks
//   Z0-Z17   matrix[0..17], each broadcast to all qword lanes
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read cursors for in[0..8]
//   R13, R12 read/write cursors for out[0], out[1]
//   Z18, Z19 accumulators; Z20 input data; Z21 scratch
TEXT ·mulGFNI_9x2_64Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x2_64Xor_end

	// Load all 18 matrix entries into registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). CX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Data pointers of out[0] and out[1].
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R12
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13
	ADDQ R14, R12

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, CX

mulGFNI_9x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R13), Z18
	VMOVDQU64 (R12), Z19

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64      (DX), Z20
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z20
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z20
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z20
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (R8), Z20
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64      (R9), Z20
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64      (R10), Z20
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64      (R11), Z20
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64      (CX), Z20
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
	VXORPD         Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
	VXORPD         Z19, Z21, Z19

	// Store 2 outputs
	VMOVDQU64 Z18, (R13)
	ADDQ      $0x40, R13
	VMOVDQU64 Z19, (R12)
	ADDQ      $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_9x2_64Xor_loop
	VZEROUPPER

mulGFNI_9x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: for every 32-byte chunk, loads the current contents
// of out[0] and out[1] and XORs into them
//   out[0] ^= XOR over i of affine(matrix[2*i],   in[i])
//   out[1] ^= XOR over i of affine(matrix[2*i+1], in[i])
// where affine is VGF2P8AFFINEQB with a zero immediate.
// All slices are addressed from byte offset start. n>>5 chunks are processed;
// the function returns without touching memory when n>>5 is zero, and any
// n%32 tail is not processed here.
//
// Only 12 of the 18 matrix entries fit in registers; entries 12..17 are
// re-broadcast from memory on every iteration, so CX must keep pointing at
// the matrix for the whole loop.
//
// Register use:
//   AX       remaining 32-byte chunks
//   CX       matrix base (live throughout)
//   Y0-Y11   matrix[0..11], each broadcast to all qword lanes
//   BX, SI, DI, R8, R9, R10, R11, R12, DX   read cursors for in[0..8]
//   R14, R13 read/write cursors for out[0], out[1]
//   Y12, Y13 accumulators; Y14 input data; Y15 scratch
TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so no separate
	// TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x2Xor_end

	// Load the first 12 matrix entries into registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Fetch the data pointer of each input slice (slice headers are 24 bytes
	// apart). DX is reused for the last input once the headers are read.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Data pointers of out[0] and out[1].
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R13
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, DX

mulAvxGFNI_9x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R14), Y12
	VMOVDQU (R13), Y13

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (matrix entries from here on are broadcast from memory each iteration)
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (R13)
	ADDQ    $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_9x2Xor_loop
	VZEROUPPER

mulAvxGFNI_9x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_9x3_64(SB), $0-88
|
|
// Loading all tables to registers
|
|
// Destination kept in GP registers
|
|
// Full registers estimated 32 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_9x3_64_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
VBROADCASTF32X2 168(CX), Z21
|
|
VBROADCASTF32X2 176(CX), Z22
|
|
VBROADCASTF32X2 184(CX), Z23
|
|
VBROADCASTF32X2 192(CX), Z24
|
|
VBROADCASTF32X2 200(CX), Z25
|
|
VBROADCASTF32X2 208(CX), Z26
|
|
MOVQ in_base+24(FP), CX
|
|
MOVQ (CX), DX
|
|
MOVQ 24(CX), BX
|
|
MOVQ 48(CX), SI
|
|
MOVQ 72(CX), DI
|
|
MOVQ 96(CX), R8
|
|
MOVQ 120(CX), R9
|
|
MOVQ 144(CX), R10
|
|
MOVQ 168(CX), R11
|
|
MOVQ 192(CX), CX
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ (R12), R13
|
|
MOVQ 24(R12), R14
|
|
MOVQ 48(R12), R12
|
|
MOVQ start+72(FP), R15
|
|
|
|
// Add start offset to output
|
|
ADDQ R15, R13
|
|
ADDQ R15, R14
|
|
ADDQ R15, R12
|
|
|
|
// Add start offset to input
|
|
ADDQ R15, DX
|
|
ADDQ R15, BX
|
|
ADDQ R15, SI
|
|
ADDQ R15, DI
|
|
ADDQ R15, R8
|
|
ADDQ R15, R9
|
|
ADDQ R15, R10
|
|
ADDQ R15, R11
|
|
ADDQ R15, CX
|
|
|
|
mulGFNI_9x3_64_loop:
|
|
// Load and process 64 bytes from input 0 to 3 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 3 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 3 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 3 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 3 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 3 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 3 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 3 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 3 outputs
|
|
VMOVDQU64 (CX), Z30
|
|
ADDQ $0x40, CX
|
|
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 3 outputs
|
|
VMOVDQU64 Z27, (R13)
|
|
ADDQ $0x40, R13
|
|
VMOVDQU64 Z28, (R14)
|
|
ADDQ $0x40, R14
|
|
VMOVDQU64 Z29, (R12)
|
|
ADDQ $0x40, R12
|
|
|
|
// Prepare for next loop
|
|
DECQ AX
|
|
JNZ mulGFNI_9x3_64_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_9x3_64_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x3 combines 9 input slices into 3 output slices, 32 bytes per
// iteration, overwriting the outputs. For every 32-byte block, output j is the
// XOR over i = 0..8 of VGF2P8AFFINEQB(block of in[i], matrix[i*3+j], imm 0).
//
//   matrix: 27 uint64 values read at byte offsets 0..208. The first 11 stay
//           broadcast in Y0-Y10; the rest are re-broadcast from memory inside
//           the loop.
//   in:     9 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    3 slice headers, 24 bytes apart; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 iterations are executed. When that is zero the
//           function returns before any slice memory is accessed. A tail of
//           n%32 bytes is not processed here.
//
// Register use: AX loop counter, CX matrix base, BX/SI/DI/R8-R12/DX input
// cursors, R14/R15/R13 output cursors (out[0], out[1], out[2]), BP start
// offset, Y11-Y13 accumulators, Y14 current input block, Y15 scratch.
TEXT ·mulAvxGFNI_9x3(SB), $8-88
	// Loading 11 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_9x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it holds the base of the slice-header array until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Fetch the data pointer of each output slice; R13 is overwritten last.
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, DX

mulAvxGFNI_9x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from matrix entry 11 onwards the table is broadcast from memory)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x3_loop
	VZEROUPPER

mulAvxGFNI_9x3_end:
	RET
|
|
|
|
// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x3_64Xor combines 9 input slices into 3 output slices, 64 bytes per
// iteration, accumulating into the bytes already present in the outputs. For
// every 64-byte block, output j becomes its previous contents XORed with
// VGF2P8AFFINEQB(block of in[i], matrix[i*3+j], imm 0) for each i = 0..8.
//
//   matrix: 27 uint64 values read at byte offsets 0..208, all kept broadcast
//           in Z0-Z26 for the whole loop.
//   in:     9 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    3 slice headers, 24 bytes apart; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>6 iterations are executed. When that is zero the
//           function returns before any slice memory is accessed. A tail of
//           n%64 bytes is not processed here.
//
// Register use: AX loop counter, DX/BX/SI/DI/R8-R11/CX input cursors (CX is
// the matrix base until the tables are loaded), R13/R14/R12 output cursors
// (out[0], out[1], out[2]), R15 start offset, Z27-Z29 accumulators,
// Z30 current input block, Z31 scratch.
TEXT ·mulGFNI_9x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_9x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	VBROADCASTF32X2 208(CX), Z26

	// Fetch the data pointer of each input slice. The matrix is fully in
	// registers now, so CX is reused and overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX

	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R12
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R12

	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, CX

mulGFNI_9x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R12), Z29

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 3 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 3 outputs
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_9x3_64Xor_loop
	VZEROUPPER

mulGFNI_9x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x3Xor combines 9 input slices into 3 output slices, 32 bytes per
// iteration, accumulating into the bytes already present in the outputs. For
// every 32-byte block, output j becomes its previous contents XORed with
// VGF2P8AFFINEQB(block of in[i], matrix[i*3+j], imm 0) for each i = 0..8.
//
//   matrix: 27 uint64 values read at byte offsets 0..208. The first 11 stay
//           broadcast in Y0-Y10; the rest are re-broadcast from memory inside
//           the loop.
//   in:     9 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    3 slice headers, 24 bytes apart; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 iterations are executed. When that is zero the
//           function returns before any slice memory is accessed. A tail of
//           n%32 bytes is not processed here.
//
// Register use: AX loop counter, CX matrix base, BX/SI/DI/R8-R12/DX input
// cursors, R14/R15/R13 output cursors (out[0], out[1], out[2]), BP start
// offset, Y11-Y13 accumulators, Y14 current input block, Y15 scratch.
TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88
	// Loading 11 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_9x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it holds the base of the slice-header array until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Fetch the data pointer of each output slice; R13 is overwritten last.
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, DX

mulAvxGFNI_9x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R13), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from matrix entry 11 onwards the table is broadcast from memory)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x3Xor_loop
	VZEROUPPER

mulAvxGFNI_9x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x4_64 combines 9 input slices into 4 output slices, 64 bytes per
// iteration, overwriting the outputs. For every 64-byte block, output j is the
// XOR over i = 0..8 of VGF2P8AFFINEQB(block of in[i], matrix[i*4+j], imm 0).
//
//   matrix: 36 uint64 values read at byte offsets 0..280. The first 26 stay
//           broadcast in Z0-Z25; the remaining 10 are used as embedded
//           broadcast (.BCST) memory operands inside the loop.
//   in:     9 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    4 slice headers, 24 bytes apart; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>6 iterations are executed. When that is zero the
//           function returns before any slice memory is accessed. A tail of
//           n%64 bytes is not processed here.
//
// Register use: CX matrix base, DX/BX/SI/DI/R8-R11/AX input cursors,
// R13/R14/R15/R12 output cursors (out[0]..out[3]), Z26-Z29 accumulators,
// Z30 current input block, Z31 scratch. AX first holds n>>6 for the early-exit
// test and is then reused as an input cursor; BP first holds the start offset
// and is then reloaded with n>>6 to serve as the loop counter.
TEXT ·mulGFNI_9x4_64(SB), $8-88
	// Loading 26 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_9x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25

	// Fetch the data pointer of each input slice; AX is overwritten last
	// because it holds the base of the slice-header array until then.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), AX

	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_9x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	// (from matrix entry 26 onwards the table is a broadcast memory operand)
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 4 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 4 outputs
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_9x4_64_loop
	VZEROUPPER

mulGFNI_9x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x4 combines 9 input slices into 4 output slices, 32 bytes per
// iteration, overwriting the outputs. For every 32-byte block, output j is the
// XOR over i = 0..8 of VGF2P8AFFINEQB(block of in[i], matrix[i*4+j], imm 0).
//
//   matrix: 36 uint64 values read at byte offsets 0..280. The first 10 stay
//           broadcast in Y0-Y9; the rest are re-broadcast from memory inside
//           the loop.
//   in:     9 slice headers, 24 bytes apart; only the data pointers are read.
//   out:    4 slice headers, 24 bytes apart; only the data pointers are read.
//   start:  byte offset added to every input and output pointer.
//   n:      byte count; n>>5 iterations are executed. When that is zero the
//           function returns before any slice memory is accessed. A tail of
//           n%32 bytes is not processed here.
//
// Register use: CX matrix base, DX/BX/SI/DI/R8-R11/AX input cursors,
// R13/R14/R15/R12 output cursors (out[0]..out[3]), Y10-Y13 accumulators,
// Y14 current input block, Y15 scratch. AX first holds n>>5 for the early-exit
// test and is then reused as an input cursor; BP first holds the start offset
// and is then reloaded with n>>5 to serve as the loop counter.
TEXT ·mulAvxGFNI_9x4(SB), $8-88
	// Loading 10 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_9x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Fetch the data pointer of each input slice; AX is overwritten last
	// because it holds the base of the slice-header array until then.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), AX

	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_9x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from matrix entry 10 onwards the table is broadcast from memory)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 4 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_9x4_loop
	VZEROUPPER

mulAvxGFNI_9x4_end:
	RET
|
|
|
|
// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_9x4_64Xor(SB), $8-88
|
|
// Loading 26 of 36 tables to registers
|
|
// Destination kept in GP registers
|
|
// Full registers estimated 42 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_9x4_64Xor_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
VBROADCASTF32X2 168(CX), Z21
|
|
VBROADCASTF32X2 176(CX), Z22
|
|
VBROADCASTF32X2 184(CX), Z23
|
|
VBROADCASTF32X2 192(CX), Z24
|
|
VBROADCASTF32X2 200(CX), Z25
|
|
MOVQ in_base+24(FP), AX
|
|
MOVQ (AX), DX
|
|
MOVQ 24(AX), BX
|
|
MOVQ 48(AX), SI
|
|
MOVQ 72(AX), DI
|
|
MOVQ 96(AX), R8
|
|
MOVQ 120(AX), R9
|
|
MOVQ 144(AX), R10
|
|
MOVQ 168(AX), R11
|
|
MOVQ 192(AX), AX
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ out_base+48(FP), R12
|
|
MOVQ (R12), R13
|
|
MOVQ 24(R12), R14
|
|
MOVQ 48(R12), R15
|
|
MOVQ 72(R12), R12
|
|
MOVQ start+72(FP), BP
|
|
|
|
// Add start offset to output
|
|
ADDQ BP, R13
|
|
ADDQ BP, R14
|
|
ADDQ BP, R15
|
|
ADDQ BP, R12
|
|
|
|
// Add start offset to input
|
|
ADDQ BP, DX
|
|
ADDQ BP, BX
|
|
ADDQ BP, SI
|
|
ADDQ BP, DI
|
|
ADDQ BP, R8
|
|
ADDQ BP, R9
|
|
ADDQ BP, R10
|
|
ADDQ BP, R11
|
|
ADDQ BP, AX
|
|
|
|
// Reload length to save a register
|
|
MOVQ n+80(FP), BP
|
|
SHRQ $0x06, BP
|
|
|
|
mulGFNI_9x4_64Xor_loop:
|
|
// Load 4 outputs
|
|
VMOVDQU64 (R13), Z26
|
|
VMOVDQU64 (R14), Z27
|
|
VMOVDQU64 (R15), Z28
|
|
VMOVDQU64 (R12), Z29
|
|
|
|
// Load and process 64 bytes from input 0 to 4 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 4 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 4 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 4 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 4 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 4 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 4 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 4 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 4 outputs
|
|
VMOVDQU64 (AX), Z30
|
|
ADDQ $0x40, AX
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 4 outputs
|
|
VMOVDQU64 Z26, (R13)
|
|
ADDQ $0x40, R13
|
|
VMOVDQU64 Z27, (R14)
|
|
ADDQ $0x40, R14
|
|
VMOVDQU64 Z28, (R15)
|
|
ADDQ $0x40, R15
|
|
VMOVDQU64 Z29, (R12)
|
|
ADDQ $0x40, R12
|
|
|
|
// Prepare for next loop
|
|
DECQ BP
|
|
JNZ mulGFNI_9x4_64Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_9x4_64Xor_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulates 9 inputs into 4 outputs, 32 bytes per iteration. Each output
// chunk is loaded, XORed with VGF2P8AFFINEQB(input chunk, matrix entry) for
// every one of the 9 inputs, and stored back. The entry used for input i and
// output o is the 8-byte value at matrix[i*4+o]. Processing begins at byte
// offset start of every slice and runs for n>>5 iterations; when n>>5 is zero
// the function returns before reading or writing any slice data.
//
// Register roles inside the loop:
//   CX                      matrix base
//   DX BX SI DI R8-R11 AX   input pointers 0..8
//   R13 R14 R15 R12         output pointers 0..3
//   BP                      remaining iterations
//   Y0-Y9                   matrix entries 0..9 (preloaded)
//   Y10-Y13                 output accumulators 0..3
//   Y14, Y15                current input chunk, scratch
//
// NOTE: this file is generated (see the header); mirror any change in gen.go.
TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88
	// Loading 10 of 36 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// follow it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Slice headers are 24 bytes apart; fetch the data pointer of each input.
	// AX is overwritten last because it is the base of the header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), AX

	// Same for the outputs; R12 is overwritten last for the same reason.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_9x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R12), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from entry 10 onward the matrix entries are broadcast from memory
	// into the scratch register on each use)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 4 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (R12)
	ADDQ    $0x20, R12

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_9x4Xor_loop
	VZEROUPPER

mulAvxGFNI_9x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 5 outputs from 9 inputs, 64 bytes per iteration, overwriting the
// outputs: the accumulators are initialised from input 0 and the remaining 8
// inputs are XORed in. Each term is VGF2P8AFFINEQB(input chunk, matrix entry),
// where the entry for input i and output o is the 8-byte value at
// matrix[i*5+o]. Processing begins at byte offset start of every slice and
// runs for n>>6 iterations; when n>>6 is zero the function returns before
// reading or writing any slice data.
//
// Register roles inside the loop:
//   AX                          remaining iterations
//   CX                          matrix base
//   BX SI DI R8-R12 DX          input pointers 0..8
//   R13                         base of the out slice-header array
//   R14                         byte offset into every output (starts at start)
//   R15                         scratch: data pointer of the output being stored
//   Z0-Z24                      matrix entries 0..24 (preloaded)
//   Z25-Z29                     output accumulators 0..4
//   Z30, Z31                    current input chunk, scratch
//
// NOTE: this file is generated (see the header); mirror any change in gen.go.
TEXT ·mulGFNI_9x5_64(SB), $0-88
	// Loading 25 of 45 tables to registers
	// Destination pointers are not kept in registers; they are re-read from
	// the out slice headers on every iteration
	// Full registers estimated 52 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// follow it directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x5_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Slice headers are 24 bytes apart; fetch the data pointer of each input.
	// DX is overwritten last because it is the base of the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (results go straight into the accumulators, discarding old output data)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (from entry 25 onward the matrix entries are broadcast from memory)
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 5 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 5 outputs
	MOVQ      (R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ      24(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ      48(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ      72(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ      96(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ  mulGFNI_9x5_64_loop
	VZEROUPPER

mulGFNI_9x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 5 outputs from 9 inputs, 32 bytes per iteration, overwriting the
// outputs: the accumulators are initialised from input 0 and the remaining 8
// inputs are XORed in. Each term is VGF2P8AFFINEQB(input chunk, matrix entry),
// where the entry for input i and output o is the 8-byte value at
// matrix[i*5+o]. Processing begins at byte offset start of every slice and
// runs for n>>5 iterations; when n>>5 is zero the function returns before
// reading or writing any slice data.
//
// Register roles inside the loop:
//   AX                          remaining iterations
//   CX                          matrix base
//   BX SI DI R8-R12 DX          input pointers 0..8
//   R13                         base of the out slice-header array
//   R14                         byte offset into every output (starts at start)
//   R15                         scratch: data pointer of the output being stored
//   Y0-Y8                       matrix entries 0..8 (preloaded)
//   Y9-Y13                      output accumulators 0..4
//   Y14, Y15                    current input chunk, scratch
//
// NOTE: this file is generated (see the header); mirror any change in gen.go.
TEXT ·mulAvxGFNI_9x5(SB), $0-88
	// Loading 9 of 45 tables to registers
	// Destination pointers are not kept in registers; they are re-read from
	// the out slice headers on every iteration
	// Full registers estimated 52 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// follow it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x5_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Slice headers are 24 bytes apart; fetch the data pointer of each input.
	// DX is overwritten last because it is the base of the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (results go straight into the accumulators, discarding old output data)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	// (from entry 9 onward the matrix entries are broadcast from memory
	// into the scratch register on each use)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 5 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	MOVQ    (R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ    24(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ    48(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ    72(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ    96(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ  mulAvxGFNI_9x5_loop
	VZEROUPPER

mulAvxGFNI_9x5_end:
	RET
|
|
|
|
// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulates 9 inputs into 5 outputs, 64 bytes per iteration. Each output
// chunk is loaded, XORed with VGF2P8AFFINEQB(input chunk, matrix entry) for
// every one of the 9 inputs, and stored back. The entry used for input i and
// output o is the 8-byte value at matrix[i*5+o]. Processing begins at byte
// offset start of every slice and runs for n>>6 iterations; when n>>6 is zero
// the function returns before reading or writing any slice data.
//
// Register roles inside the loop:
//   AX                          remaining iterations
//   CX                          matrix base
//   BX SI DI R8-R12 DX          input pointers 0..8
//   R13                         base of the out slice-header array
//   R14                         byte offset into every output (starts at start)
//   R15                         scratch: data pointer of the output in use
//   Z0-Z24                      matrix entries 0..24 (preloaded)
//   Z25-Z29                     output accumulators 0..4
//   Z30, Z31                    current input chunk, scratch
//
// NOTE: this file is generated (see the header); mirror any change in gen.go.
TEXT ·mulGFNI_9x5_64Xor(SB), $0-88
	// Loading 25 of 45 tables to registers
	// Destination pointers are not kept in registers; they are re-read from
	// the out slice headers on every iteration
	// Full registers estimated 52 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// follow it directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x5_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Slice headers are 24 bytes apart; fetch the data pointer of each input.
	// DX is overwritten last because it is the base of the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x5_64Xor_loop:
	// Load 5 outputs
	MOVQ      (R13), R15
	VMOVDQU64 (R15)(R14*1), Z25
	MOVQ      24(R13), R15
	VMOVDQU64 (R15)(R14*1), Z26
	MOVQ      48(R13), R15
	VMOVDQU64 (R15)(R14*1), Z27
	MOVQ      72(R13), R15
	VMOVDQU64 (R15)(R14*1), Z28
	MOVQ      96(R13), R15
	VMOVDQU64 (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (from entry 25 onward the matrix entries are broadcast from memory)
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 5 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 5 outputs
	MOVQ      (R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ      24(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ      48(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ      72(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ      96(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ  mulGFNI_9x5_64Xor_loop
	VZEROUPPER

mulGFNI_9x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88
|
|
// Loading 9 of 45 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 52 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_9x5Xor_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
VBROADCASTSD 32(CX), Y4
|
|
VBROADCASTSD 40(CX), Y5
|
|
VBROADCASTSD 48(CX), Y6
|
|
VBROADCASTSD 56(CX), Y7
|
|
VBROADCASTSD 64(CX), Y8
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), DX
|
|
MOVQ out_base+48(FP), R13
|
|
MOVQ out_base+48(FP), R13
|
|
MOVQ start+72(FP), R14
|
|
|
|
// Add start offset to input
|
|
ADDQ R14, BX
|
|
ADDQ R14, SI
|
|
ADDQ R14, DI
|
|
ADDQ R14, R8
|
|
ADDQ R14, R9
|
|
ADDQ R14, R10
|
|
ADDQ R14, R11
|
|
ADDQ R14, R12
|
|
ADDQ R14, DX
|
|
|
|
mulAvxGFNI_9x5Xor_loop:
|
|
// Load 5 outputs
|
|
MOVQ (R13), R15
|
|
VMOVDQU (R15)(R14*1), Y9
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y10
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y11
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y12
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y13
|
|
|
|
// Load and process 32 bytes from input 0 to 5 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 5 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 5 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 5 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 5 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 5 outputs
|
|
VMOVDQU (R10), Y14
|
|
ADDQ $0x20, R10
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 6 to 5 outputs
|
|
VMOVDQU (R11), Y14
|
|
ADDQ $0x20, R11
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 7 to 5 outputs
|
|
VMOVDQU (R12), Y14
|
|
ADDQ $0x20, R12
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 8 to 5 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 5 outputs
|
|
MOVQ (R13), R15
|
|
VMOVDQU Y9, (R15)(R14*1)
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU Y10, (R15)(R14*1)
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU Y11, (R15)(R14*1)
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU Y12, (R15)(R14*1)
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU Y13, (R15)(R14*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R14
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_9x5Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_9x5Xor_end:
|
|
RET
|
|
|
|
// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x6_64 combines 9 input slices into 6 output slices, 64 bytes per
// iteration, overwriting the outputs.
//
// For every 64-byte block, output j receives the XOR over inputs i = 0..8 of
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]). Each matrix entry is a uint64 used as
// the 8x8 bit-matrix operand of the affine instruction (presumably encoding a
// GF(2^8) coefficient - confirm against the Go caller).
//
// Only n>>6 whole blocks are processed; a trailing n&63 bytes are left
// untouched. If n < 64 the function returns without reading or writing any
// slice data.
//
// Register roles:
//   AX       remaining 64-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8-R12, DX
//            read cursors for in[0]..in[8] (already offset by start)
//   R13      base of the out slice-header array (headers are 24 bytes apart)
//   R14      current byte offset into every output (starts at start)
//   R15      scratch: data pointer of the output being stored
//   Z0-Z23   matrix entries 0..23 (inputs 0-3), broadcast once
//   Z24-Z29  accumulators for outputs 0..5
//   Z30      current input block
//   Z31      scratch product
TEXT ·mulGFNI_9x6_64(SB), $0-88
	// Matrix entries 24..53 (inputs 4-8) do not fit in registers and are
	// applied through embedded-broadcast memory operands inside the loop.
	// Output data pointers are not cached; they are re-read from the out
	// slice headers on every iteration.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ can test the block count directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x6_64_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Fetch the data pointer of each input slice; DX is reused for in[8]
	// last, once the header array base is no longer needed.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (from here on the matrix entry is broadcast straight from memory)
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 6 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 6 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	MOVQ      (R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ      24(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ      48(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ      72(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ      96(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ      120(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ  mulGFNI_9x6_64_loop
	VZEROUPPER

mulGFNI_9x6_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x6 combines 9 input slices into 6 output slices, 32 bytes per
// iteration, overwriting the outputs.
//
// For every 32-byte block, output j receives the XOR over inputs i = 0..8 of
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]). Each matrix entry is a uint64 used as
// the 8x8 bit-matrix operand of the affine instruction (presumably encoding a
// GF(2^8) coefficient - confirm against the Go caller).
//
// Only n>>5 whole blocks are processed; a trailing n&31 bytes are left
// untouched. If n < 32 the function returns without reading or writing any
// slice data.
//
// Register roles:
//   AX       remaining 32-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8-R12, DX
//            read cursors for in[0]..in[8] (already offset by start)
//   R13      base of the out slice-header array (headers are 24 bytes apart)
//   R14      current byte offset into every output (starts at start)
//   R15      scratch: data pointer of the output being stored
//   Y0-Y7    matrix entries 0..7, broadcast once
//   Y8-Y13   accumulators for outputs 0..5
//   Y14      current input block
//   Y15      scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_9x6(SB), $0-88
	// Only 8 of the 54 matrix entries are kept in registers; entries 8..53
	// are re-broadcast from memory into Y15 inside the loop.
	// Output data pointers are not cached; they are re-read from the out
	// slice headers on every iteration.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ can test the block count directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x6_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Fetch the data pointer of each input slice; DX is reused for in[8]
	// last, once the header array base is no longer needed.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	// (the last two register-resident entries, then memory broadcasts)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	MOVQ    (R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ    24(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ    48(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ    72(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ    96(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ    120(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ  mulAvxGFNI_9x6_loop
	VZEROUPPER

mulAvxGFNI_9x6_end:
	RET
|
|
|
|
// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x6_64Xor combines 9 input slices into 6 output slices, 64 bytes
// per iteration, XOR-ing the result into the existing output contents.
//
// For every 64-byte block, output j becomes its previous value XOR-ed with
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]) for every input i = 0..8. Each matrix
// entry is a uint64 used as the 8x8 bit-matrix operand of the affine
// instruction (presumably encoding a GF(2^8) coefficient - confirm against
// the Go caller).
//
// Only n>>6 whole blocks are processed; a trailing n&63 bytes are left
// untouched. If n < 64 the function returns without reading or writing any
// slice data.
//
// Register roles:
//   AX       remaining 64-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8-R12, DX
//            read cursors for in[0]..in[8] (already offset by start)
//   R13      base of the out slice-header array (headers are 24 bytes apart)
//   R14      current byte offset into every output (starts at start)
//   R15      scratch: data pointer of the output being loaded/stored
//   Z0-Z23   matrix entries 0..23 (inputs 0-3), broadcast once
//   Z24-Z29  accumulators for outputs 0..5
//   Z30      current input block
//   Z31      scratch product
TEXT ·mulGFNI_9x6_64Xor(SB), $0-88
	// Matrix entries 24..53 (inputs 4-8) do not fit in registers and are
	// applied through embedded-broadcast memory operands inside the loop.
	// Output data pointers are not cached; they are re-read from the out
	// slice headers on every iteration.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ can test the block count directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x6_64Xor_end

	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Fetch the data pointer of each input slice; DX is reused for in[8]
	// last, once the header array base is no longer needed.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x6_64Xor_loop:
	// Load 6 outputs
	// (existing output contents seed the accumulators)
	MOVQ      (R13), R15
	VMOVDQU64 (R15)(R14*1), Z24
	MOVQ      24(R13), R15
	VMOVDQU64 (R15)(R14*1), Z25
	MOVQ      48(R13), R15
	VMOVDQU64 (R15)(R14*1), Z26
	MOVQ      72(R13), R15
	VMOVDQU64 (R15)(R14*1), Z27
	MOVQ      96(R13), R15
	VMOVDQU64 (R15)(R14*1), Z28
	MOVQ      120(R13), R15
	VMOVDQU64 (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (from here on the matrix entry is broadcast straight from memory)
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 6 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 6 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	MOVQ      (R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ      24(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ      48(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ      72(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ      96(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ      120(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ  mulGFNI_9x6_64Xor_loop
	VZEROUPPER

mulGFNI_9x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x6Xor combines 9 input slices into 6 output slices, 32 bytes
// per iteration, XOR-ing the result into the existing output contents.
//
// For every 32-byte block, output j becomes its previous value XOR-ed with
// VGF2P8AFFINEQB(in[i], matrix[i*6+j]) for every input i = 0..8. Each matrix
// entry is a uint64 used as the 8x8 bit-matrix operand of the affine
// instruction (presumably encoding a GF(2^8) coefficient - confirm against
// the Go caller).
//
// Only n>>5 whole blocks are processed; a trailing n&31 bytes are left
// untouched. If n < 32 the function returns without reading or writing any
// slice data.
//
// Register roles:
//   AX       remaining 32-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8-R12, DX
//            read cursors for in[0]..in[8] (already offset by start)
//   R13      base of the out slice-header array (headers are 24 bytes apart)
//   R14      current byte offset into every output (starts at start)
//   R15      scratch: data pointer of the output being loaded/stored
//   Y0-Y7    matrix entries 0..7, broadcast once
//   Y8-Y13   accumulators for outputs 0..5
//   Y14      current input block
//   Y15      scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88
	// Only 8 of the 54 matrix entries are kept in registers; entries 8..53
	// are re-broadcast from memory into Y15 inside the loop.
	// Output data pointers are not cached; they are re-read from the out
	// slice headers on every iteration.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so JZ can test the block count directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x6Xor_end

	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Fetch the data pointer of each input slice; DX is reused for in[8]
	// last, once the header array base is no longer needed.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x6Xor_loop:
	// Load 6 outputs
	// (existing output contents seed the accumulators)
	MOVQ    (R13), R15
	VMOVDQU (R15)(R14*1), Y8
	MOVQ    24(R13), R15
	VMOVDQU (R15)(R14*1), Y9
	MOVQ    48(R13), R15
	VMOVDQU (R15)(R14*1), Y10
	MOVQ    72(R13), R15
	VMOVDQU (R15)(R14*1), Y11
	MOVQ    96(R13), R15
	VMOVDQU (R15)(R14*1), Y12
	MOVQ    120(R13), R15
	VMOVDQU (R15)(R14*1), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	// (the last two register-resident entries, then memory broadcasts)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	MOVQ    (R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ    24(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ    48(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ    72(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ    96(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ    120(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ  mulAvxGFNI_9x6Xor_loop
	VZEROUPPER

mulAvxGFNI_9x6Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 7 output shards from 9 input shards, 64 bytes per iteration,
// overwriting the destination. For every 64-byte chunk k (k < n/64) and
// every output j:
//   out[j][start+64k ...] = XOR over i of affine(matrix[i*7+j], in[i][start+64k ...])
// where affine is VGF2P8AFFINEQB with a zero immediate. The n%64 tail is not
// processed; when n < 64 the function returns before touching any shard.
//
// Register roles:
//   AX                      remaining 64-byte chunks
//   CX                      &matrix[0]; entries 0..62 are read
//   BX, SI, DI, R8-R12, DX  read cursors for in[0]..in[8]
//   R13                     &out[0] (array of 24-byte slice headers)
//   R14                     byte offset applied to every output, starts at start
//   R15                     scratch: data pointer of the output being stored
//   Z0-Z22                  matrix[0..22] broadcast to every 64-bit lane
//   Z23-Z29                 accumulators for out[0]..out[6]
//   Z30, Z31                current input chunk, scratch product
TEXT ·mulGFNI_9x7_64(SB), $0-88
	// Loading 23 of 63 tables to registers
	// Destination kept on stack
	// Full registers estimated 72 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x7_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice (24-byte slice headers).
	// DX is overwritten last because it holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (from matrix entry 23 on, coefficients are broadcast from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 7 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	// (output data pointers are re-read from the slice headers each pass)
	MOVQ (R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x7_64_loop
	VZEROUPPER

mulGFNI_9x7_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 7 output shards from 9 input shards, 32 bytes per iteration,
// overwriting the destination. For every 32-byte chunk k (k < n/32) and
// every output j:
//   out[j][start+32k ...] = XOR over i of affine(matrix[i*7+j], in[i][start+32k ...])
// where affine is VGF2P8AFFINEQB with a zero immediate. The n%32 tail is not
// processed; when n < 32 the function returns before touching any shard.
//
// Register roles:
//   AX                      remaining 32-byte chunks
//   CX                      &matrix[0]; entries 0..62 are read
//   BX, SI, DI, R8-R12, DX  read cursors for in[0]..in[8]
//   R13                     &out[0] (array of 24-byte slice headers)
//   R14                     byte offset applied to every output, starts at start
//   R15                     scratch: data pointer of the output being stored
//   Y0-Y6                   matrix[0..6] broadcast to every 64-bit lane
//   Y7-Y13                  accumulators for out[0]..out[6]
//   Y14                     current input chunk
//   Y15                     scratch: broadcast coefficient, then its product
TEXT ·mulAvxGFNI_9x7(SB), $0-88
	// Loading 7 of 63 tables to registers
	// Destination kept on stack
	// Full registers estimated 72 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice (24-byte slice headers).
	// DX is overwritten last because it holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// (remaining coefficients are broadcast from memory into Y15 on demand)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	// (output data pointers are re-read from the slice headers each pass)
	MOVQ (R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ mulAvxGFNI_9x7_loop
	VZEROUPPER

mulAvxGFNI_9x7_end:
	RET
|
|
|
|
// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: the existing contents of each output are loaded and
// the 9 input products are XORed into them, 64 bytes per iteration. For every
// 64-byte chunk k (k < n/64) and every output j:
//   out[j][start+64k ...] ^= XOR over i of affine(matrix[i*7+j], in[i][start+64k ...])
// where affine is VGF2P8AFFINEQB with a zero immediate. The n%64 tail is not
// processed; when n < 64 the function returns before touching any shard.
//
// Register roles:
//   AX                      remaining 64-byte chunks
//   CX                      &matrix[0]; entries 0..62 are read
//   BX, SI, DI, R8-R12, DX  read cursors for in[0]..in[8]
//   R13                     &out[0] (array of 24-byte slice headers)
//   R14                     byte offset applied to every output, starts at start
//   R15                     scratch: data pointer of the output being accessed
//   Z0-Z22                  matrix[0..22] broadcast to every 64-bit lane
//   Z23-Z29                 accumulators for out[0]..out[6]
//   Z30, Z31                current input chunk, scratch product
TEXT ·mulGFNI_9x7_64Xor(SB), $0-88
	// Loading 23 of 63 tables to registers
	// Destination kept on stack
	// Full registers estimated 72 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice (24-byte slice headers).
	// DX is overwritten last because it holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x7_64Xor_loop:
	// Load 7 outputs
	// (current destination contents seed the accumulators)
	MOVQ (R13), R15
	VMOVDQU64 (R15)(R14*1), Z23
	MOVQ 24(R13), R15
	VMOVDQU64 (R15)(R14*1), Z24
	MOVQ 48(R13), R15
	VMOVDQU64 (R15)(R14*1), Z25
	MOVQ 72(R13), R15
	VMOVDQU64 (R15)(R14*1), Z26
	MOVQ 96(R13), R15
	VMOVDQU64 (R15)(R14*1), Z27
	MOVQ 120(R13), R15
	VMOVDQU64 (R15)(R14*1), Z28
	MOVQ 144(R13), R15
	VMOVDQU64 (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (from matrix entry 23 on, coefficients are broadcast from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 7 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	// (output data pointers are re-read from the slice headers each pass)
	MOVQ (R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x7_64Xor_loop
	VZEROUPPER

mulGFNI_9x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: the existing contents of each output are loaded and
// the 9 input products are XORed into them, 32 bytes per iteration. For every
// 32-byte chunk k (k < n/32) and every output j:
//   out[j][start+32k ...] ^= XOR over i of affine(matrix[i*7+j], in[i][start+32k ...])
// where affine is VGF2P8AFFINEQB with a zero immediate. The n%32 tail is not
// processed; when n < 32 the function returns before touching any shard.
//
// Register roles:
//   AX                      remaining 32-byte chunks
//   CX                      &matrix[0]; entries 0..62 are read
//   BX, SI, DI, R8-R12, DX  read cursors for in[0]..in[8]
//   R13                     &out[0] (array of 24-byte slice headers)
//   R14                     byte offset applied to every output, starts at start
//   R15                     scratch: data pointer of the output being accessed
//   Y0-Y6                   matrix[0..6] broadcast to every 64-bit lane
//   Y7-Y13                  accumulators for out[0]..out[6]
//   Y14                     current input chunk
//   Y15                     scratch: broadcast coefficient, then its product
TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88
	// Loading 7 of 63 tables to registers
	// Destination kept on stack
	// Full registers estimated 72 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice (24-byte slice headers).
	// DX is overwritten last because it holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x7Xor_loop:
	// Load 7 outputs
	// (current destination contents seed the accumulators)
	MOVQ (R13), R15
	VMOVDQU (R15)(R14*1), Y7
	MOVQ 24(R13), R15
	VMOVDQU (R15)(R14*1), Y8
	MOVQ 48(R13), R15
	VMOVDQU (R15)(R14*1), Y9
	MOVQ 72(R13), R15
	VMOVDQU (R15)(R14*1), Y10
	MOVQ 96(R13), R15
	VMOVDQU (R15)(R14*1), Y11
	MOVQ 120(R13), R15
	VMOVDQU (R15)(R14*1), Y12
	MOVQ 144(R13), R15
	VMOVDQU (R15)(R14*1), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// (remaining coefficients are broadcast from memory into Y15 on demand)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	// (output data pointers are re-read from the slice headers each pass)
	MOVQ (R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ mulAvxGFNI_9x7Xor_loop
	VZEROUPPER

mulAvxGFNI_9x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x8_64 combines 9 input slices into 8 output slices, 64 bytes per
// loop iteration. For output j, each 64-byte block is overwritten with the
// XOR over inputs i = 0..8 of VGF2P8AFFINEQB(block of in[i], matrix[i*8+j])
// with an affine constant of 0.
//
// Only n>>6 whole 64-byte blocks are processed, beginning at byte offset
// start in every input and output; a remainder of n below 64 bytes is left
// untouched. Slice lengths are never read here, so bounds are the caller's
// responsibility.
//
// Register roles:
//   AX              remaining 64-byte blocks
//   CX              &matrix[0]
//   Z0-Z21          matrix[0..21], broadcast once before the loop
//   BX,SI,DI,R8-R12 data pointers of in[0..7] (already offset by start)
//   DX              base of in, then data pointer of in[8]
//   R13             base of the out slice headers (24 bytes apart)
//   R14             running byte offset into every output (start, +64 per loop)
//   R15             scratch: current output data pointer
//   Z22-Z29         accumulators for outputs 0..7
//   Z30             current input block
//   Z31             scratch product
//
// NOTE(review): the file header marks this file as generated (DO NOT EDIT);
// mirror any change in the generator so it survives regeneration.
TEXT ·mulGFNI_9x8_64(SB), $0-88
	// Loading 22 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 82 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ            $0x06, AX
	JZ              mulGFNI_9x8_64_end

	// Broadcast the first 22 matrix entries; the remaining 50 are read from
	// memory with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the base of in until then.
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), R8
	MOVQ            96(DX), R9
	MOVQ            120(DX), R10
	MOVQ            144(DX), R11
	MOVQ            168(DX), R12
	MOVQ            192(DX), DX
	MOVQ            out_base+48(FP), R13
	MOVQ            start+72(FP), R14

	// Add start offset to input
	ADDQ            R14, BX
	ADDQ            R14, SI
	ADDQ            R14, DI
	ADDQ            R14, R8
	ADDQ            R14, R9
	ADDQ            R14, R10
	ADDQ            R14, R11
	ADDQ            R14, R12
	ADDQ            R14, DX

mulGFNI_9x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64           (BX), Z30
	ADDQ                $0x40, BX
	VGF2P8AFFINEQB      $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB      $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB      $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB      $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB      $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB      $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB      $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB      $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64           (SI), Z30
	ADDQ                $0x40, SI
	VGF2P8AFFINEQB      $0x00, Z8, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z9, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z10, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z11, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z12, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z13, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z14, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z15, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (register-resident matrix entries run out after Z21)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// (each output data pointer is re-read from its slice header every pass)
	MOVQ                (R13), R15
	VMOVDQU64           Z22, (R15)(R14*1)
	MOVQ                24(R13), R15
	VMOVDQU64           Z23, (R15)(R14*1)
	MOVQ                48(R13), R15
	VMOVDQU64           Z24, (R15)(R14*1)
	MOVQ                72(R13), R15
	VMOVDQU64           Z25, (R15)(R14*1)
	MOVQ                96(R13), R15
	VMOVDQU64           Z26, (R15)(R14*1)
	MOVQ                120(R13), R15
	VMOVDQU64           Z27, (R15)(R14*1)
	MOVQ                144(R13), R15
	VMOVDQU64           Z28, (R15)(R14*1)
	MOVQ                168(R13), R15
	VMOVDQU64           Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ                $0x40, R14
	DECQ                AX
	JNZ                 mulGFNI_9x8_64_loop
	VZEROUPPER

mulGFNI_9x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x8 combines 9 input slices into 8 output slices, 32 bytes per
// loop iteration. For output j, each 32-byte block is overwritten with the
// XOR over inputs i = 0..8 of VGF2P8AFFINEQB(block of in[i], matrix[i*8+j])
// with an affine constant of 0.
//
// Only n>>5 whole 32-byte blocks are processed, beginning at byte offset
// start in every input and output; a remainder of n below 32 bytes is left
// untouched. Slice lengths are never read here, so bounds are the caller's
// responsibility.
//
// Register roles:
//   AX              remaining 32-byte blocks
//   CX              &matrix[0]
//   Y0-Y5           matrix[0..5], broadcast once before the loop
//   BX,SI,DI,R8-R12 data pointers of in[0..7] (already offset by start)
//   DX              base of in, then data pointer of in[8]
//   R13             base of the out slice headers (24 bytes apart)
//   R14             running byte offset into every output (start, +32 per loop)
//   R15             scratch: current output data pointer
//   Y6-Y13          accumulators for outputs 0..7
//   Y14             current input block
//   Y15             scratch: broadcast matrix entry, then its product
//
// NOTE(review): the file header marks this file as generated (DO NOT EDIT);
// mirror any change in the generator so it survives regeneration.
TEXT ·mulAvxGFNI_9x8(SB), $0-88
	// Loading 6 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 82 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ         $0x05, AX
	JZ           mulAvxGFNI_9x8_end

	// Only the first 6 matrix entries stay resident; all others are
	// broadcast from memory right before use inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the base of in until then.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), R11
	MOVQ         168(DX), R12
	MOVQ         192(DX), DX
	MOVQ         out_base+48(FP), R13
	MOVQ         start+72(FP), R14

	// Add start offset to input
	ADDQ         R14, BX
	ADDQ         R14, SI
	ADDQ         R14, DI
	ADDQ         R14, R8
	ADDQ         R14, R9
	ADDQ         R14, R10
	ADDQ         R14, R11
	ADDQ         R14, R12
	ADDQ         R14, DX

mulAvxGFNI_9x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD   48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 8 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	// (each output data pointer is re-read from its slice header every pass)
	MOVQ           (R13), R15
	VMOVDQU        Y6, (R15)(R14*1)
	MOVQ           24(R13), R15
	VMOVDQU        Y7, (R15)(R14*1)
	MOVQ           48(R13), R15
	VMOVDQU        Y8, (R15)(R14*1)
	MOVQ           72(R13), R15
	VMOVDQU        Y9, (R15)(R14*1)
	MOVQ           96(R13), R15
	VMOVDQU        Y10, (R15)(R14*1)
	MOVQ           120(R13), R15
	VMOVDQU        Y11, (R15)(R14*1)
	MOVQ           144(R13), R15
	VMOVDQU        Y12, (R15)(R14*1)
	MOVQ           168(R13), R15
	VMOVDQU        Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ           $0x20, R14
	DECQ           AX
	JNZ            mulAvxGFNI_9x8_loop
	VZEROUPPER

mulAvxGFNI_9x8_end:
	RET
|
|
|
|
// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x8_64Xor combines 9 input slices into 8 output slices, 64 bytes
// per loop iteration, accumulating into the existing output contents. For
// output j, each 64-byte block becomes its previous value XORed with, for
// every input i = 0..8, VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]) with
// an affine constant of 0.
//
// Only n>>6 whole 64-byte blocks are processed, beginning at byte offset
// start in every input and output; a remainder of n below 64 bytes is left
// untouched. Slice lengths are never read here, so bounds are the caller's
// responsibility.
//
// Register roles:
//   AX              remaining 64-byte blocks
//   CX              &matrix[0]
//   Z0-Z21          matrix[0..21], broadcast once before the loop
//   BX,SI,DI,R8-R12 data pointers of in[0..7] (already offset by start)
//   DX              base of in, then data pointer of in[8]
//   R13             base of the out slice headers (24 bytes apart)
//   R14             running byte offset into every output (start, +64 per loop)
//   R15             scratch: current output data pointer
//   Z22-Z29         accumulators for outputs 0..7
//   Z30             current input block
//   Z31             scratch product
//
// NOTE(review): the file header marks this file as generated (DO NOT EDIT);
// mirror any change in the generator so it survives regeneration.
TEXT ·mulGFNI_9x8_64Xor(SB), $0-88
	// Loading 22 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 82 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ            $0x06, AX
	JZ              mulGFNI_9x8_64Xor_end

	// Broadcast the first 22 matrix entries; the remaining 50 are read from
	// memory with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the base of in until then.
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), R8
	MOVQ            96(DX), R9
	MOVQ            120(DX), R10
	MOVQ            144(DX), R11
	MOVQ            168(DX), R12
	MOVQ            192(DX), DX
	MOVQ            out_base+48(FP), R13
	MOVQ            start+72(FP), R14

	// Add start offset to input
	ADDQ            R14, BX
	ADDQ            R14, SI
	ADDQ            R14, DI
	ADDQ            R14, R8
	ADDQ            R14, R9
	ADDQ            R14, R10
	ADDQ            R14, R11
	ADDQ            R14, R12
	ADDQ            R14, DX

mulGFNI_9x8_64Xor_loop:
	// Load 8 outputs
	// (the accumulators start from the current output contents)
	MOVQ                (R13), R15
	VMOVDQU64           (R15)(R14*1), Z22
	MOVQ                24(R13), R15
	VMOVDQU64           (R15)(R14*1), Z23
	MOVQ                48(R13), R15
	VMOVDQU64           (R15)(R14*1), Z24
	MOVQ                72(R13), R15
	VMOVDQU64           (R15)(R14*1), Z25
	MOVQ                96(R13), R15
	VMOVDQU64           (R15)(R14*1), Z26
	MOVQ                120(R13), R15
	VMOVDQU64           (R15)(R14*1), Z27
	MOVQ                144(R13), R15
	VMOVDQU64           (R15)(R14*1), Z28
	MOVQ                168(R13), R15
	VMOVDQU64           (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64           (BX), Z30
	ADDQ                $0x40, BX
	VGF2P8AFFINEQB      $0x00, Z0, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z1, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z2, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z3, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z4, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z5, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z6, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z7, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64           (SI), Z30
	ADDQ                $0x40, SI
	VGF2P8AFFINEQB      $0x00, Z8, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z9, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z10, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z11, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z12, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z13, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB      $0x00, Z14, Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB      $0x00, Z15, Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (register-resident matrix entries run out after Z21)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// (each output data pointer is re-read from its slice header every pass)
	MOVQ                (R13), R15
	VMOVDQU64           Z22, (R15)(R14*1)
	MOVQ                24(R13), R15
	VMOVDQU64           Z23, (R15)(R14*1)
	MOVQ                48(R13), R15
	VMOVDQU64           Z24, (R15)(R14*1)
	MOVQ                72(R13), R15
	VMOVDQU64           Z25, (R15)(R14*1)
	MOVQ                96(R13), R15
	VMOVDQU64           Z26, (R15)(R14*1)
	MOVQ                120(R13), R15
	VMOVDQU64           Z27, (R15)(R14*1)
	MOVQ                144(R13), R15
	VMOVDQU64           Z28, (R15)(R14*1)
	MOVQ                168(R13), R15
	VMOVDQU64           Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ                $0x40, R14
	DECQ                AX
	JNZ                 mulGFNI_9x8_64Xor_loop
	VZEROUPPER

mulGFNI_9x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88
|
|
// Loading 6 of 72 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 82 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_9x8Xor_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
VBROADCASTSD 32(CX), Y4
|
|
VBROADCASTSD 40(CX), Y5
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), DX
|
|
MOVQ out_base+48(FP), R13
|
|
MOVQ out_base+48(FP), R13
|
|
MOVQ start+72(FP), R14
|
|
|
|
// Add start offset to input
|
|
ADDQ R14, BX
|
|
ADDQ R14, SI
|
|
ADDQ R14, DI
|
|
ADDQ R14, R8
|
|
ADDQ R14, R9
|
|
ADDQ R14, R10
|
|
ADDQ R14, R11
|
|
ADDQ R14, R12
|
|
ADDQ R14, DX
|
|
|
|
mulAvxGFNI_9x8Xor_loop:
|
|
// Load 8 outputs
|
|
MOVQ (R13), R15
|
|
VMOVDQU (R15)(R14*1), Y6
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y7
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y8
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y9
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y10
|
|
MOVQ 120(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y11
|
|
MOVQ 144(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y12
|
|
MOVQ 168(R13), R15
|
|
VMOVDQU (R15)(R14*1), Y13
|
|
|
|
// Load and process 32 bytes from input 0 to 8 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 48(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 56(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 8 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 64(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 8 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 8 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 8 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 8 outputs
|
|
VMOVDQU (R10), Y14
|
|
ADDQ $0x20, R10
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 360(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 368(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 376(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 6 to 8 outputs
|
|
VMOVDQU (R11), Y14
|
|
ADDQ $0x20, R11
|
|
VBROADCASTSD 384(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 392(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 400(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 408(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 416(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 424(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 432(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 440(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 7 to 8 outputs
|
|
VMOVDQU (R12), Y14
|
|
ADDQ $0x20, R12
|
|
VBROADCASTSD 448(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 456(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 464(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 472(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 480(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 488(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 496(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 504(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 8 to 8 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 512(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 520(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 528(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 536(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 544(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 552(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 560(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 568(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 8 outputs
|
|
MOVQ (R13), R15
|
|
VMOVDQU Y6, (R15)(R14*1)
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU Y7, (R15)(R14*1)
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU Y8, (R15)(R14*1)
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU Y9, (R15)(R14*1)
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU Y10, (R15)(R14*1)
|
|
MOVQ 120(R13), R15
|
|
VMOVDQU Y11, (R15)(R14*1)
|
|
MOVQ 144(R13), R15
|
|
VMOVDQU Y12, (R15)(R14*1)
|
|
MOVQ 168(R13), R15
|
|
VMOVDQU Y13, (R15)(R14*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R14
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_9x8Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_9x8Xor_end:
|
|
RET
|
|
|
|
// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Combines 9 input slices into 9 output slices, 64 bytes per loop iteration.
// Each (input, output) pair uses one 8-byte matrix entry with VGF2P8AFFINEQB;
// the per-input results are XOR-ed together and the output bytes are overwritten.
// Only n>>6 whole 64-byte blocks are processed; this routine does not handle a tail.
// Register roles (as set up below):
//   AX       = remaining 64-byte blocks
//   CX       = matrix base, Z0-Z20 = first 21 matrix entries (broadcast)
//   BX, SI, DI, R8-R12, DX = read pointers for in[0]..in[8]
//   R13      = out base (slice headers), R14 = byte offset into every output
//   Z21-Z29  = accumulators for out[0]..out[8], Z30 = input data, Z31 = temporary
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_9x9_64(SB), $0-88
|
|
// Loading 21 of 81 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 92 YMM used
|
|
// AX = n >> 6 (number of 64-byte blocks); return immediately when it is zero
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_9x9_64_end
|
|
// Broadcast matrix entries 0..20 (8 bytes each) into Z0..Z20
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
// Fetch the data pointer of in[0]..in[8]; consecutive slice headers are 24 bytes apart.
// DX is used as the header base and is overwritten last with the pointer for in[8].
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), DX
|
|
// R13 = base of the out slice headers
MOVQ out_base+48(FP), R13
|
|
// NOTE(review): this repeats the load on the previous instruction; R13 already holds out_base
MOVQ out_base+48(FP), R13
|
|
// R14 = start; used below both to advance the inputs and as the output byte offset
MOVQ start+72(FP), R14
|
|
|
|
// Add start offset to input
|
|
ADDQ R14, BX
|
|
ADDQ R14, SI
|
|
ADDQ R14, DI
|
|
ADDQ R14, R8
|
|
ADDQ R14, R9
|
|
ADDQ R14, R10
|
|
ADDQ R14, R11
|
|
ADDQ R14, R12
|
|
ADDQ R14, DX
|
|
|
|
mulGFNI_9x9_64_loop:
|
|
// Load and process 64 bytes from input 0 to 9 outputs
// Input 0 writes Z21-Z29 directly, which initialises the accumulators for this block
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 9 outputs
// From here on each product goes to Z31 and is XOR-ed into the matching accumulator
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 9 outputs
// Matrix entries beyond the 21 preloaded ones are read from memory with .BCST
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 9 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 9 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 9 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 9 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 9 outputs
|
|
VMOVDQU64 (R12), Z30
|
|
ADDQ $0x40, R12
|
|
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 9 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 9 outputs
// The data pointer of each out[j] is re-read from its slice header at 24*j(R13)
// into R15, and the accumulator is written at byte offset R14 from that pointer
|
|
MOVQ (R13), R15
|
|
VMOVDQU64 Z21, (R15)(R14*1)
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU64 Z22, (R15)(R14*1)
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU64 Z23, (R15)(R14*1)
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU64 Z24, (R15)(R14*1)
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU64 Z25, (R15)(R14*1)
|
|
MOVQ 120(R13), R15
|
|
VMOVDQU64 Z26, (R15)(R14*1)
|
|
MOVQ 144(R13), R15
|
|
VMOVDQU64 Z27, (R15)(R14*1)
|
|
MOVQ 168(R13), R15
|
|
VMOVDQU64 Z28, (R15)(R14*1)
|
|
MOVQ 192(R13), R15
|
|
VMOVDQU64 Z29, (R15)(R14*1)
|
|
|
|
// Prepare for next loop
// Advance the output offset by 64 bytes and count down the remaining blocks
|
|
ADDQ $0x40, R14
|
|
DECQ AX
|
|
JNZ mulGFNI_9x9_64_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_9x9_64_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Combines 9 input slices into 9 output slices, 32 bytes per loop iteration.
// Each (input, output) pair uses one 8-byte matrix entry with VGF2P8AFFINEQB;
// the per-input results are XOR-ed together and the output bytes are overwritten.
// Only n>>5 whole 32-byte blocks are processed; this routine does not handle a tail.
// Register roles (as set up below):
//   AX       = remaining 32-byte blocks
//   CX       = matrix base, Y0-Y4 = first 5 matrix entries (broadcast once)
//   BX, SI, DI, R8-R12, DX = read pointers for in[0]..in[8]
//   R13      = out base (slice headers), R14 = byte offset into every output
//   Y5-Y13   = accumulators for out[0]..out[8], Y14 = input data, Y15 = temporary
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_9x9(SB), $0-88
|
|
// Loading 5 of 81 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 92 YMM used
|
|
// AX = n >> 5 (number of 32-byte blocks); return immediately when it is zero
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_9x9_end
|
|
// Broadcast matrix entries 0..4 (8 bytes each) into Y0..Y4
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
VBROADCASTSD 32(CX), Y4
|
|
// Fetch the data pointer of in[0]..in[8]; consecutive slice headers are 24 bytes apart.
// DX is used as the header base and is overwritten last with the pointer for in[8].
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), DX
|
|
// R13 = base of the out slice headers
MOVQ out_base+48(FP), R13
|
|
// NOTE(review): this repeats the load on the previous instruction; R13 already holds out_base
MOVQ out_base+48(FP), R13
|
|
// R14 = start; used below both to advance the inputs and as the output byte offset
MOVQ start+72(FP), R14
|
|
|
|
// Add start offset to input
|
|
ADDQ R14, BX
|
|
ADDQ R14, SI
|
|
ADDQ R14, DI
|
|
ADDQ R14, R8
|
|
ADDQ R14, R9
|
|
ADDQ R14, R10
|
|
ADDQ R14, R11
|
|
ADDQ R14, R12
|
|
ADDQ R14, DX
|
|
|
|
mulAvxGFNI_9x9_loop:
|
|
// Load and process 32 bytes from input 0 to 9 outputs
// Input 0 writes Y5-Y13 directly, which initialises the accumulators for this block.
// Entries 5..8 are not preloaded: each is broadcast into its own accumulator register
// and then replaced in place by the product.
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
|
|
VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
|
|
VBROADCASTSD 40(CX), Y10
|
|
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
|
|
VBROADCASTSD 48(CX), Y11
|
|
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
|
|
VBROADCASTSD 56(CX), Y12
|
|
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
|
|
VBROADCASTSD 64(CX), Y13
|
|
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 9 outputs
// From here on every step is: broadcast one matrix entry into Y15, replace Y15 with
// the product for the data in Y14, then XOR Y15 into the matching accumulator
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 9 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 9 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 9 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 9 outputs
|
|
VMOVDQU (R10), Y14
|
|
ADDQ $0x20, R10
|
|
VBROADCASTSD 360(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 368(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 376(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 384(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 392(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 400(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 408(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 416(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 424(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 6 to 9 outputs
|
|
VMOVDQU (R11), Y14
|
|
ADDQ $0x20, R11
|
|
VBROADCASTSD 432(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 440(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 448(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 456(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 464(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 472(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 480(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 488(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 496(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 7 to 9 outputs
|
|
VMOVDQU (R12), Y14
|
|
ADDQ $0x20, R12
|
|
VBROADCASTSD 504(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 512(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 520(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 528(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 536(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 544(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 552(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 560(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 568(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 8 to 9 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 576(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 584(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 592(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 600(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 608(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 616(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 624(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 632(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 640(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 9 outputs
// The data pointer of each out[j] is re-read from its slice header at 24*j(R13)
// into R15, and the accumulator is written at byte offset R14 from that pointer
|
|
MOVQ (R13), R15
|
|
VMOVDQU Y5, (R15)(R14*1)
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU Y6, (R15)(R14*1)
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU Y7, (R15)(R14*1)
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU Y8, (R15)(R14*1)
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU Y9, (R15)(R14*1)
|
|
MOVQ 120(R13), R15
|
|
VMOVDQU Y10, (R15)(R14*1)
|
|
MOVQ 144(R13), R15
|
|
VMOVDQU Y11, (R15)(R14*1)
|
|
MOVQ 168(R13), R15
|
|
VMOVDQU Y12, (R15)(R14*1)
|
|
MOVQ 192(R13), R15
|
|
VMOVDQU Y13, (R15)(R14*1)
|
|
|
|
// Prepare for next loop
// Advance the output offset by 32 bytes and count down the remaining blocks
|
|
ADDQ $0x20, R14
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_9x9_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_9x9_end:
|
|
RET
|
|
|
|
// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Accumulating variant: 64 bytes per loop iteration, the current contents of the
// 9 outputs are loaded first, the products of all 9 inputs (one 8-byte matrix entry
// per input/output pair, applied with VGF2P8AFFINEQB) are XOR-ed into them, and the
// result is written back to the same output location.
// Only n>>6 whole 64-byte blocks are processed; this routine does not handle a tail.
// Register roles (as set up below):
//   AX       = remaining 64-byte blocks
//   CX       = matrix base, Z0-Z20 = first 21 matrix entries (broadcast)
//   BX, SI, DI, R8-R12, DX = read pointers for in[0]..in[8]
//   R13      = out base (slice headers), R14 = byte offset into every output
//   Z21-Z29  = accumulators for out[0]..out[8], Z30 = input data, Z31 = temporary
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_9x9_64Xor(SB), $0-88
|
|
// Loading 21 of 81 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 92 YMM used
|
|
// AX = n >> 6 (number of 64-byte blocks); return immediately when it is zero
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_9x9_64Xor_end
|
|
// Broadcast matrix entries 0..20 (8 bytes each) into Z0..Z20
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
// Fetch the data pointer of in[0]..in[8]; consecutive slice headers are 24 bytes apart.
// DX is used as the header base and is overwritten last with the pointer for in[8].
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), DX
|
|
// R13 = base of the out slice headers
MOVQ out_base+48(FP), R13
|
|
// NOTE(review): this repeats the load on the previous instruction; R13 already holds out_base
MOVQ out_base+48(FP), R13
|
|
// R14 = start; used below both to advance the inputs and as the output byte offset
MOVQ start+72(FP), R14
|
|
|
|
// Add start offset to input
|
|
ADDQ R14, BX
|
|
ADDQ R14, SI
|
|
ADDQ R14, DI
|
|
ADDQ R14, R8
|
|
ADDQ R14, R9
|
|
ADDQ R14, R10
|
|
ADDQ R14, R11
|
|
ADDQ R14, R12
|
|
ADDQ R14, DX
|
|
|
|
mulGFNI_9x9_64Xor_loop:
|
|
// Load 9 outputs
// Seed the accumulators Z21-Z29 with the existing output bytes at offset R14;
// each output data pointer is read from its slice header at 24*j(R13) into R15
|
|
MOVQ (R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z21
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z22
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z23
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z24
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z25
|
|
MOVQ 120(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z26
|
|
MOVQ 144(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z27
|
|
MOVQ 168(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z28
|
|
MOVQ 192(R13), R15
|
|
VMOVDQU64 (R15)(R14*1), Z29
|
|
|
|
// Load and process 64 bytes from input 0 to 9 outputs
// Every product, including input 0's, goes to Z31 and is XOR-ed into its accumulator
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 9 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 9 outputs
// Matrix entries beyond the 21 preloaded ones are read from memory with .BCST
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 9 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 9 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 9 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 9 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 9 outputs
|
|
VMOVDQU64 (R12), Z30
|
|
ADDQ $0x40, R12
|
|
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 9 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 9 outputs
// Write each accumulator back to the same location it was loaded from
// (pointer from 24*j(R13), byte offset R14)
|
|
MOVQ (R13), R15
|
|
VMOVDQU64 Z21, (R15)(R14*1)
|
|
MOVQ 24(R13), R15
|
|
VMOVDQU64 Z22, (R15)(R14*1)
|
|
MOVQ 48(R13), R15
|
|
VMOVDQU64 Z23, (R15)(R14*1)
|
|
MOVQ 72(R13), R15
|
|
VMOVDQU64 Z24, (R15)(R14*1)
|
|
MOVQ 96(R13), R15
|
|
VMOVDQU64 Z25, (R15)(R14*1)
|
|
MOVQ 120(R13), R15
|
|
VMOVDQU64 Z26, (R15)(R14*1)
|
|
MOVQ 144(R13), R15
|
|
VMOVDQU64 Z27, (R15)(R14*1)
|
|
MOVQ 168(R13), R15
|
|
VMOVDQU64 Z28, (R15)(R14*1)
|
|
MOVQ 192(R13), R15
|
|
VMOVDQU64 Z29, (R15)(R14*1)
|
|
|
|
// Prepare for next loop
// Advance the output offset by 64 bytes and count down the remaining blocks
|
|
ADDQ $0x40, R14
|
|
DECQ AX
|
|
JNZ mulGFNI_9x9_64Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_9x9_64Xor_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x9Xor handles n>>5 blocks of 32 bytes; the bytes of a trailing
// partial block are not touched. For every block, the 32 bytes read from
// input i (0-8) are transformed with VGF2P8AFFINEQB using the 64-bit word
// matrix[i*9+j] and XORed into the 32 bytes already stored in output j (0-8).
// Inputs are read starting at in[i][start], outputs are updated starting at
// out[j][start].
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R12, DX  read pointers for inputs 0-8
//   R13                     base of the out slice headers (24 bytes apart)
//   R14                     byte offset into every output, +32 per block
//   R15                     scratch: data pointer of the output being accessed
//   Y0-Y4                   matrix words 0-4, broadcast once before the loop
//   Y5-Y13                  accumulators for outputs 0-8
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix word / transformed block
TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88
	// Loading 5 of 81 tables to registers
	// Destination kept on stack
	// Full registers estimated 92 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x9Xor_end

	// Matrix words 0-4 (input 0 -> outputs 0-4) stay resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice. DX holds the header base
	// until the last load replaces it with the pointer for input 8.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x9Xor_loop:
	// Load 9 outputs
	MOVQ (R13), R15
	VMOVDQU (R15)(R14*1), Y5
	MOVQ 24(R13), R15
	VMOVDQU (R15)(R14*1), Y6
	MOVQ 48(R13), R15
	VMOVDQU (R15)(R14*1), Y7
	MOVQ 72(R13), R15
	VMOVDQU (R15)(R14*1), Y8
	MOVQ 96(R13), R15
	VMOVDQU (R15)(R14*1), Y9
	MOVQ 120(R13), R15
	VMOVDQU (R15)(R14*1), Y10
	MOVQ 144(R13), R15
	VMOVDQU (R15)(R14*1), Y11
	MOVQ 168(R13), R15
	VMOVDQU (R15)(R14*1), Y12
	MOVQ 192(R13), R15
	VMOVDQU (R15)(R14*1), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R13), R15
	VMOVDQU Y5, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y6, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ  mulAvxGFNI_9x9Xor_loop
	VZEROUPPER

mulAvxGFNI_9x9Xor_end:
	RET
|
|
|
|
// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x10_64 handles n>>6 blocks of 64 bytes; the bytes of a trailing
// partial block are not touched. For every block, output j (0-9) is
// overwritten with the XOR, over the 9 inputs i, of the 64 bytes of input i
// transformed by VGF2P8AFFINEQB with the 64-bit word matrix[i*10+j].
// Inputs are read starting at in[i][start], outputs are written starting at
// out[j][start].
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R12, DX  read pointers for inputs 0-8
//   R13                     base of the out slice headers (24 bytes apart)
//   R14                     byte offset into every output, +64 per block
//   R15                     scratch: data pointer of the output being stored
//   Z0-Z19                  matrix words 0-19 (inputs 0 and 1), broadcast once
//   Z20-Z29                 accumulators for outputs 0-9
//   Z30                     current input block
//   Z31                     scratch: transformed block
// Matrix words 20-89 are read inside the loop through the embedded broadcast
// (.BCST) memory operand.
TEXT ·mulGFNI_9x10_64(SB), $0-88
	// Loading 20 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 102 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x10_64_end

	// Matrix words 0-19 stay resident in registers.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice. DX holds the header base
	// until the last load replaces it with the pointer for input 8.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs
	// (input 0 initialises the accumulators, so nothing is XORed here)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 10 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z20, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z21, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 216(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ  mulGFNI_9x10_64_loop
	VZEROUPPER

mulGFNI_9x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x10 handles n>>5 blocks of 32 bytes; the bytes of a trailing
// partial block are not touched. For every block, output j (0-9) is
// overwritten with the XOR, over the 9 inputs i, of the 32 bytes of input i
// transformed by VGF2P8AFFINEQB with the 64-bit word matrix[i*10+j].
// Inputs are read starting at in[i][start], outputs are written starting at
// out[j][start].
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R12, DX  read pointers for inputs 0-8
//   R13                     base of the out slice headers (24 bytes apart)
//   R14                     byte offset into every output, +32 per block
//   R15                     scratch: data pointer of the output being stored
//   Y0-Y3                   matrix words 0-3, broadcast once before the loop
//   Y4-Y13                  accumulators for outputs 0-9
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix word / transformed block
TEXT ·mulAvxGFNI_9x10(SB), $0-88
	// Loading 4 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 102 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x10_end

	// Matrix words 0-3 (input 0 -> outputs 0-3) stay resident in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the data pointer of each input slice. DX holds the header base
	// until the last load replaces it with the pointer for input 8.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	// (input 0 initialises the accumulators, so nothing is XORed here; for
	// outputs 4-9 the accumulator register first receives the matrix word)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R13), R15
	VMOVDQU Y4, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y5, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y6, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 216(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ  mulAvxGFNI_9x10_loop
	VZEROUPPER

mulAvxGFNI_9x10_end:
	RET
|
|
|
|
// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_9x10_64Xor(SB), $0-88
	// XORs into 10 existing outputs: for every output j, the 64-byte block at the
	// current offset is XORed with VGF2P8AFFINEQB(in[i], matrix[i*10+j]) for each
	// of the 9 inputs i. Only n>>6 whole 64-byte blocks are processed.
	//
	// Register roles:
	//   AX                  remaining 64-byte blocks
	//   CX                  matrix base; table for input i / output j is at 8*(i*10+j)
	//   BX,SI,DI,R8-R12,DX  read cursors for inputs 0-8 (start already added)
	//   R13                 base of the out slice-header array (24 bytes per entry)
	//   R14                 byte offset applied to every output (start, +64 per pass)
	//   R15                 scratch: data pointer of the output being loaded/stored
	//   Z0-Z19              tables 0-19, broadcast once before the loop
	//   Z20-Z29             output accumulators
	//   Z30 / Z31           current input block / product scratch
	// Tables 20-89 do not fit in registers and are broadcast from memory (.BCST).
	//
	// NOTE(review): this file is generated ("DO NOT EDIT" header); mirror any
	// hand edit in gen.go or it will be lost on regeneration.

	// Loading 20 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 102 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_9x10_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice; DX is overwritten last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x10_64Xor_loop:
	// Load 10 outputs
	MOVQ (R13), R15
	VMOVDQU64 (R15)(R14*1), Z20
	MOVQ 24(R13), R15
	VMOVDQU64 (R15)(R14*1), Z21
	MOVQ 48(R13), R15
	VMOVDQU64 (R15)(R14*1), Z22
	MOVQ 72(R13), R15
	VMOVDQU64 (R15)(R14*1), Z23
	MOVQ 96(R13), R15
	VMOVDQU64 (R15)(R14*1), Z24
	MOVQ 120(R13), R15
	VMOVDQU64 (R15)(R14*1), Z25
	MOVQ 144(R13), R15
	VMOVDQU64 (R15)(R14*1), Z26
	MOVQ 168(R13), R15
	VMOVDQU64 (R15)(R14*1), Z27
	MOVQ 192(R13), R15
	VMOVDQU64 (R15)(R14*1), Z28
	MOVQ 216(R13), R15
	VMOVDQU64 (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 10 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z20, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z21, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 216(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ  mulGFNI_9x10_64Xor_loop
	VZEROUPPER

mulGFNI_9x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88
	// XORs into 10 existing outputs: for every output j, the 32-byte block at the
	// current offset is XORed with VGF2P8AFFINEQB(in[i], matrix[i*10+j]) for each
	// of the 9 inputs i. Only n>>5 whole 32-byte blocks are processed.
	//
	// Register roles:
	//   AX                  remaining 32-byte blocks
	//   CX                  matrix base; table for input i / output j is at 8*(i*10+j)
	//   BX,SI,DI,R8-R12,DX  read cursors for inputs 0-8 (start already added)
	//   R13                 base of the out slice-header array (24 bytes per entry)
	//   R14                 byte offset applied to every output (start, +32 per pass)
	//   R15                 scratch: data pointer of the output being loaded/stored
	//   Y0-Y3               tables 0-3, broadcast once before the loop
	//   Y4-Y13              output accumulators
	//   Y14                 current input block
	//   Y15                 scratch: broadcast table, then the product
	//
	// NOTE(review): this file is generated ("DO NOT EDIT" header); mirror any
	// hand edit in gen.go or it will be lost on regeneration.

	// Loading 4 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 102 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_9x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the data pointer of each input slice; DX is overwritten last.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x10Xor_loop:
	// Load 10 outputs
	MOVQ (R13), R15
	VMOVDQU (R15)(R14*1), Y4
	MOVQ 24(R13), R15
	VMOVDQU (R15)(R14*1), Y5
	MOVQ 48(R13), R15
	VMOVDQU (R15)(R14*1), Y6
	MOVQ 72(R13), R15
	VMOVDQU (R15)(R14*1), Y7
	MOVQ 96(R13), R15
	VMOVDQU (R15)(R14*1), Y8
	MOVQ 120(R13), R15
	VMOVDQU (R15)(R14*1), Y9
	MOVQ 144(R13), R15
	VMOVDQU (R15)(R14*1), Y10
	MOVQ 168(R13), R15
	VMOVDQU (R15)(R14*1), Y11
	MOVQ 192(R13), R15
	VMOVDQU (R15)(R14*1), Y12
	MOVQ 216(R13), R15
	VMOVDQU (R15)(R14*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R13), R15
	VMOVDQU Y4, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y5, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y6, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 216(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ  mulAvxGFNI_9x10Xor_loop
	VZEROUPPER

mulAvxGFNI_9x10Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x1_64(SB), $0-88
	// Overwrites out[0] with the XOR over the 10 inputs i of
	// VGF2P8AFFINEQB(in[i], matrix[i]), 64 bytes per iteration.
	// Only n>>6 whole 64-byte blocks are processed.
	//
	// Register roles:
	//   AX                  remaining 64-byte blocks
	//   CX                  matrix base until the tables are loaded, then input 9 cursor
	//   DX,BX,SI,DI,R8-R12  read cursors for inputs 0-8 (start already added)
	//   R13                 write cursor into out[0] (start already added)
	//   R14                 start offset (used only during setup)
	//   Z0-Z9               the 10 tables, broadcast once before the loop
	//   Z10                 output accumulator
	//   Z11                 current input block, then its product
	//
	// NOTE(review): this file is generated ("DO NOT EDIT" header); mirror any
	// hand edit in gen.go or it will be lost on regeneration.

	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x1_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// Fetch the data pointer of each input slice; CX is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulGFNI_10x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// The first product is written straight to Z10, initialising the accumulator.
	VMOVDQU64 (DX), Z11
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z11, Z10

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z11
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z11
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z11
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z11
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z11
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z11
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (R11), Z11
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64 (R12), Z11
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 9 to 1 outputs
	VMOVDQU64 (CX), Z11
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Store 1 outputs
	VMOVDQU64 Z10, (R13)
	ADDQ $0x40, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_10x1_64_loop
	VZEROUPPER

mulGFNI_10x1_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x1(SB), $0-88
	// Overwrites out[0] with the XOR over the 10 inputs i of
	// VGF2P8AFFINEQB(in[i], matrix[i]), 32 bytes per iteration.
	// Only n>>5 whole 32-byte blocks are processed.
	//
	// Register roles:
	//   AX                  remaining 32-byte blocks
	//   CX                  matrix base until the tables are loaded, then input 9 cursor
	//   DX,BX,SI,DI,R8-R12  read cursors for inputs 0-8 (start already added)
	//   R13                 write cursor into out[0] (start already added)
	//   R14                 start offset (used only during setup)
	//   Y0-Y9               the 10 tables, broadcast once before the loop
	//   Y10                 output accumulator
	//   Y11                 current input block, then its product
	//
	// NOTE(review): this file is generated ("DO NOT EDIT" header); mirror any
	// hand edit in gen.go or it will be lost on regeneration.

	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_10x1_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Fetch the data pointer of each input slice; CX is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulAvxGFNI_10x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// The first product is written straight to Y10, initialising the accumulator.
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y11, Y10

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y11
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y11
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y11
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (R11), Y11
	ADDQ $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU (R12), Y11
	ADDQ $0x20, R12
	VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 9 to 1 outputs
	VMOVDQU (CX), Y11
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Store 1 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_10x1_loop
	VZEROUPPER

mulAvxGFNI_10x1_end:
	RET
|
|
|
|
// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x1_64Xor(SB), $0-88
	// XORs into the existing contents of out[0] the value
	// VGF2P8AFFINEQB(in[i], matrix[i]) for each of the 10 inputs i,
	// 64 bytes per iteration. Only n>>6 whole 64-byte blocks are processed.
	//
	// Register roles:
	//   AX                  remaining 64-byte blocks
	//   CX                  matrix base until the tables are loaded, then input 9 cursor
	//   DX,BX,SI,DI,R8-R12  read cursors for inputs 0-8 (start already added)
	//   R13                 read/write cursor into out[0] (start already added)
	//   R14                 start offset (used only during setup)
	//   Z0-Z9               the 10 tables, broadcast once before the loop
	//   Z10                 output accumulator, seeded from out[0]
	//   Z11                 current input block, then its product
	//
	// NOTE(review): this file is generated ("DO NOT EDIT" header); mirror any
	// hand edit in gen.go or it will be lost on regeneration.

	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// Fetch the data pointer of each input slice; CX is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulGFNI_10x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R13), Z10

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z11
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z11
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z11
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z11
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z11
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z11
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z11
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (R11), Z11
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64 (R12), Z11
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 9 to 1 outputs
	VMOVDQU64 (CX), Z11
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Store 1 outputs
	VMOVDQU64 Z10, (R13)
	ADDQ $0x40, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_10x1_64Xor_loop
	VZEROUPPER

mulGFNI_10x1_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88
	// XORs into the existing contents of out[0] the value
	// VGF2P8AFFINEQB(in[i], matrix[i]) for each of the 10 inputs i,
	// 32 bytes per iteration. Only n>>5 whole 32-byte blocks are processed.
	//
	// Register roles:
	//   AX                  remaining 32-byte blocks
	//   CX                  matrix base until the tables are loaded, then input 9 cursor
	//   DX,BX,SI,DI,R8-R12  read cursors for inputs 0-8 (start already added)
	//   R13                 read/write cursor into out[0] (start already added)
	//   R14                 start offset (used only during setup)
	//   Y0-Y9               the 10 tables, broadcast once before the loop
	//   Y10                 output accumulator, seeded from out[0]
	//   Y11                 current input block, then its product
	//
	// NOTE(review): this file is generated ("DO NOT EDIT" header); mirror any
	// hand edit in gen.go or it will be lost on regeneration.

	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_10x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Fetch the data pointer of each input slice; CX is overwritten last.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulAvxGFNI_10x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R13), Y10

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y11
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y11
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y11
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (R11), Y11
	ADDQ $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU (R12), Y11
	ADDQ $0x20, R12
	VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 9 to 1 outputs
	VMOVDQU (CX), Y11
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Store 1 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_10x1Xor_loop
	VZEROUPPER

mulAvxGFNI_10x1Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte chunk this kernel computes, for j in 0..1,
//   out[j] = XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*2+j])
// and overwrites out[j] with the result. Processing begins at byte offset
// start in every slice and covers n/64 whole chunks; a remainder of n%64
// bytes is not processed. When n < 64 the function returns immediately.
//
// Register use:
//   AX                      remaining 64-byte chunk count
//   CX                      matrix base during setup, then input 9 pointer
//   DX, BX, SI, DI, R8-R12  input 0-8 pointers
//   R14, R13                output 0 and output 1 pointers
//   R15                     start offset (setup only)
//   Z0-Z19                  broadcast matrix entries 0-19
//   Z20, Z21                output accumulators
//   Z22                     current input chunk
//   Z23                     scratch product
TEXT ·mulGFNI_10x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 24 ZMM used (Z0-Z23)
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// Chunk count = n / 64. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ            $0x06, AX
	JZ              mulGFNI_10x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. CX is overwritten last because it is the base being read from.
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), DI
	MOVQ            96(CX), R8
	MOVQ            120(CX), R9
	MOVQ            144(CX), R10
	MOVQ            168(CX), R11
	MOVQ            192(CX), R12
	MOVQ            216(CX), CX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ            out_base+48(FP), R13
	MOVQ            (R13), R14
	MOVQ            24(R13), R13
	MOVQ            start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13

	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, CX

mulGFNI_10x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64      (DX), Z22
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z22, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z22, Z21

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z22
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z22
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z22
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (R8), Z22
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64      (R9), Z22
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64      (R10), Z22
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64      (R11), Z22
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64      (R12), Z22
	ADDQ           $0x40, R12
	VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 9 to 2 outputs
	VMOVDQU64      (CX), Z22
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Store 2 outputs
	VMOVDQU64 Z20, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z21, (R13)
	ADDQ      $0x40, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_10x2_64_loop
	VZEROUPPER

mulGFNI_10x2_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte chunk this kernel computes, for j in 0..1,
//   out[j] = XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*2+j])
// and overwrites out[j] with the result. Processing begins at byte offset
// start in every slice and covers n/32 whole chunks; a remainder of n%32
// bytes is not processed. When n < 32 the function returns immediately.
//
// Register use:
//   AX                          remaining 32-byte chunk count
//   CX                          matrix base (stays live for in-loop broadcasts)
//   BX, SI, DI, R8-R13, DX      input 0-9 pointers
//   R15, R14                    output 0 and output 1 pointers
//   BP                          start offset (setup only)
//   Y0-Y11                      preloaded matrix entries 0-11
//   Y12, Y13                    output accumulators
//   Y14                         current input chunk
//   Y15                         scratch: in-loop matrix entry / product
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as scratch here - confirm against the
// Go assembler guide.
TEXT ·mulAvxGFNI_10x2(SB), $8-88
	// Loading 12 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 24 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX

	// Chunk count = n / 32. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ         $0x05, AX
	JZ           mulAvxGFNI_10x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. DX is overwritten last because it is the base being read from.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), R11
	MOVQ         168(DX), R12
	MOVQ         192(DX), R13
	MOVQ         216(DX), DX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ         out_base+48(FP), R14
	MOVQ         (R14), R15
	MOVQ         24(R14), R14
	MOVQ         start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R15
	ADDQ BP, R14

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, DX

mulAvxGFNI_10x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (matrix entries 12-19 are not resident; broadcast each from memory)
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 2 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (R14)
	ADDQ    $0x20, R14

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_10x2_loop
	VZEROUPPER

mulAvxGFNI_10x2_end:
	RET
|
|
|
|
// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte chunk this kernel computes, for j in 0..1,
//   out[j] ^= XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*2+j])
// i.e. the existing contents of out[j] are loaded first and the products
// are accumulated into them. Processing begins at byte offset start in
// every slice and covers n/64 whole chunks; a remainder of n%64 bytes is
// not processed. When n < 64 the function returns immediately.
//
// Register use:
//   AX                      remaining 64-byte chunk count
//   CX                      matrix base during setup, then input 9 pointer
//   DX, BX, SI, DI, R8-R12  input 0-8 pointers
//   R14, R13                output 0 and output 1 pointers
//   R15                     start offset (setup only)
//   Z0-Z19                  broadcast matrix entries 0-19
//   Z20, Z21                output accumulators
//   Z22                     current input chunk
//   Z23                     scratch product
TEXT ·mulGFNI_10x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 24 ZMM used (Z0-Z23)
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// Chunk count = n / 64. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ            $0x06, AX
	JZ              mulGFNI_10x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. CX is overwritten last because it is the base being read from.
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), DI
	MOVQ            96(CX), R8
	MOVQ            120(CX), R9
	MOVQ            144(CX), R10
	MOVQ            168(CX), R11
	MOVQ            192(CX), R12
	MOVQ            216(CX), CX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ            out_base+48(FP), R13
	MOVQ            (R13), R14
	MOVQ            24(R13), R13
	MOVQ            start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13

	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, CX

mulGFNI_10x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R14), Z20
	VMOVDQU64 (R13), Z21

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64      (DX), Z22
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z22
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z22
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z22
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (R8), Z22
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64      (R9), Z22
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64      (R10), Z22
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64      (R11), Z22
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64      (R12), Z22
	ADDQ           $0x40, R12
	VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Load and process 64 bytes from input 9 to 2 outputs
	VMOVDQU64      (CX), Z22
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
	VXORPD         Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
	VXORPD         Z21, Z23, Z21

	// Store 2 outputs
	VMOVDQU64 Z20, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z21, (R13)
	ADDQ      $0x40, R13

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_10x2_64Xor_loop
	VZEROUPPER

mulGFNI_10x2_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte chunk this kernel computes, for j in 0..1,
//   out[j] ^= XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*2+j])
// i.e. the existing contents of out[j] are loaded first and the products
// are accumulated into them. Processing begins at byte offset start in
// every slice and covers n/32 whole chunks; a remainder of n%32 bytes is
// not processed. When n < 32 the function returns immediately.
//
// Register use:
//   AX                          remaining 32-byte chunk count
//   CX                          matrix base (stays live for in-loop broadcasts)
//   BX, SI, DI, R8-R13, DX      input 0-9 pointers
//   R15, R14                    output 0 and output 1 pointers
//   BP                          start offset (setup only)
//   Y0-Y11                      preloaded matrix entries 0-11
//   Y12, Y13                    output accumulators
//   Y14                         current input chunk
//   Y15                         scratch: in-loop matrix entry / product
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as scratch here - confirm against the
// Go assembler guide.
TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88
	// Loading 12 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 24 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX

	// Chunk count = n / 32. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ         $0x05, AX
	JZ           mulAvxGFNI_10x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. DX is overwritten last because it is the base being read from.
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), R11
	MOVQ         168(DX), R12
	MOVQ         192(DX), R13
	MOVQ         216(DX), DX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ         out_base+48(FP), R14
	MOVQ         (R14), R15
	MOVQ         24(R14), R14
	MOVQ         start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R15
	ADDQ BP, R14

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, DX

mulAvxGFNI_10x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R15), Y12
	VMOVDQU (R14), Y13

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (matrix entries 12-19 are not resident; broadcast each from memory)
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 2 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (R14)
	ADDQ    $0x20, R14

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_10x2Xor_loop
	VZEROUPPER

mulAvxGFNI_10x2Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte chunk this kernel computes, for j in 0..2,
//   out[j] = XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*3+j])
// and overwrites out[j] with the result. Processing begins at byte offset
// start in every slice and covers n/64 whole chunks; a remainder of n%64
// bytes is not processed. When n < 64 the function returns immediately.
//
// Register use:
//   AX                      n/64 for the early-out test, then input 9 pointer
//   CX                      matrix base (stays live for in-loop broadcasts)
//   DX, BX, SI, DI, R8-R12  input 0-8 pointers
//   R14, R15, R13           output 0, 1 and 2 pointers
//   BP                      start offset during setup, then chunk counter
//   Z0-Z26                  preloaded matrix entries 0-26
//   Z27, Z28, Z29           output accumulators
//   Z30                     current input chunk
//   Z31                     scratch product
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as scratch here - confirm against the
// Go assembler guide.
TEXT ·mulGFNI_10x3_64(SB), $8-88
	// Loading 27 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 ZMM used; only 32 exist, so the last
	// three tables are broadcast from memory inside the loop.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// Chunk count = n / 64. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ            $0x06, AX
	JZ              mulGFNI_10x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	VBROADCASTF32X2 208(CX), Z26

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. AX is overwritten last because it is the base being read from.
	MOVQ            in_base+24(FP), AX
	MOVQ            (AX), DX
	MOVQ            24(AX), BX
	MOVQ            48(AX), SI
	MOVQ            72(AX), DI
	MOVQ            96(AX), R8
	MOVQ            120(AX), R9
	MOVQ            144(AX), R10
	MOVQ            168(AX), R11
	MOVQ            192(AX), R12
	MOVQ            216(AX), AX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ            out_base+48(FP), R13
	MOVQ            (R13), R14
	MOVQ            24(R13), R15
	MOVQ            48(R13), R13
	MOVQ            start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_10x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z29

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64      (R10), Z30
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64      (R11), Z30
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 3 outputs
	VMOVDQU64      (R12), Z30
	ADDQ           $0x40, R12
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 3 outputs
	// (matrix entries 27-29 are not resident; use the broadcast memory form)
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 3 outputs
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (R13)
	ADDQ      $0x40, R13

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_10x3_64_loop
	VZEROUPPER

mulGFNI_10x3_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte chunk this kernel computes, for j in 0..2,
//   out[j] = XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*3+j])
// and overwrites out[j] with the result. Processing begins at byte offset
// start in every slice and covers n/32 whole chunks; a remainder of n%32
// bytes is not processed. When n < 32 the function returns immediately.
//
// Register use:
//   AX                      n/32 for the early-out test, then input 9 pointer
//   CX                      matrix base (stays live for in-loop broadcasts)
//   DX, BX, SI, DI, R8-R12  input 0-8 pointers
//   R14, R15, R13           output 0, 1 and 2 pointers
//   BP                      start offset during setup, then chunk counter
//   Y0-Y10                  preloaded matrix entries 0-10
//   Y11, Y12, Y13           output accumulators
//   Y14                     current input chunk
//   Y15                     scratch: in-loop matrix entry / product
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as scratch here - confirm against the
// Go assembler guide.
TEXT ·mulAvxGFNI_10x3(SB), $8-88
	// Loading 11 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX

	// Chunk count = n / 32. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ         $0x05, AX
	JZ           mulAvxGFNI_10x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. AX is overwritten last because it is the base being read from.
	MOVQ         in_base+24(FP), AX
	MOVQ         (AX), DX
	MOVQ         24(AX), BX
	MOVQ         48(AX), SI
	MOVQ         72(AX), DI
	MOVQ         96(AX), R8
	MOVQ         120(AX), R9
	MOVQ         144(AX), R10
	MOVQ         168(AX), R11
	MOVQ         192(AX), R12
	MOVQ         216(AX), AX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ         out_base+48(FP), R13
	MOVQ         (R13), R14
	MOVQ         24(R13), R15
	MOVQ         48(R13), R13
	MOVQ         start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_10x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (the first input initialises the accumulators, so no XOR is needed)
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from matrix entry 11 onward the tables are not resident; broadcast
	// each from memory)
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 3 outputs
	VMOVDQU        (AX), Y14
	ADDQ           $0x20, AX
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ    $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ    $0x20, R13

	// Prepare for next loop
	DECQ BP
	JNZ  mulAvxGFNI_10x3_loop
	VZEROUPPER

mulAvxGFNI_10x3_end:
	RET
|
|
|
|
// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte chunk this kernel computes, for j in 0..2,
//   out[j] ^= XOR over i in 0..9 of VGF2P8AFFINEQB(in[i], matrix[i*3+j])
// i.e. the existing contents of out[j] are loaded first and the products
// are accumulated into them. Processing begins at byte offset start in
// every slice and covers n/64 whole chunks; a remainder of n%64 bytes is
// not processed. When n < 64 the function returns immediately.
//
// Register use:
//   AX                      n/64 for the early-out test, then input 9 pointer
//   CX                      matrix base (stays live for in-loop broadcasts)
//   DX, BX, SI, DI, R8-R12  input 0-8 pointers
//   R14, R15, R13           output 0, 1 and 2 pointers
//   BP                      start offset during setup, then chunk counter
//   Z0-Z26                  preloaded matrix entries 0-26
//   Z27, Z28, Z29           output accumulators
//   Z30                     current input chunk
//   Z31                     scratch product
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as scratch here - confirm against the
// Go assembler guide.
TEXT ·mulGFNI_10x3_64Xor(SB), $8-88
	// Loading 27 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 ZMM used; only 32 exist, so the last
	// three tables are broadcast from memory inside the loop.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX

	// Chunk count = n / 64. SHRQ with a non-zero count sets ZF from the
	// result, so the branch can follow it directly.
	SHRQ            $0x06, AX
	JZ              mulGFNI_10x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	VBROADCASTF32X2 208(CX), Z26

	// Fetch the data pointer of each input slice; slice headers are 24 bytes
	// apart. AX is overwritten last because it is the base being read from.
	MOVQ            in_base+24(FP), AX
	MOVQ            (AX), DX
	MOVQ            24(AX), BX
	MOVQ            48(AX), SI
	MOVQ            72(AX), DI
	MOVQ            96(AX), R8
	MOVQ            120(AX), R9
	MOVQ            144(AX), R10
	MOVQ            168(AX), R11
	MOVQ            192(AX), R12
	MOVQ            216(AX), AX

	// Fetch the data pointer of each output slice (base loaded once).
	MOVQ            out_base+48(FP), R13
	MOVQ            (R13), R14
	MOVQ            24(R13), R15
	MOVQ            48(R13), R13
	MOVQ            start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_10x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R13), Z29

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z30
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64      (R10), Z30
	ADDQ           $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64      (R11), Z30
	ADDQ           $0x40, R11
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 3 outputs
	VMOVDQU64      (R12), Z30
	ADDQ           $0x40, R12
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 3 outputs
	// (matrix entries 27-29 are not resident; use the broadcast memory form)
	VMOVDQU64           (AX), Z30
	ADDQ                $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 3 outputs
	VMOVDQU64 Z27, (R14)
	ADDQ      $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ      $0x40, R15
	VMOVDQU64 Z29, (R13)
	ADDQ      $0x40, R13

	// Prepare for next loop
	DECQ BP
	JNZ  mulGFNI_10x3_64Xor_loop
	VZEROUPPER

mulGFNI_10x3_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x3Xor folds 10 input slices into 3 output slices, 32 bytes per
// iteration, XORing into the bytes already present in out. For input i and
// output j the 32-byte input block is run through VGF2P8AFFINEQB (constant
// term 0) with the 64-bit entry matrix[i*3+j], and the result is XORed into
// the accumulator for out[j].
// NOTE(review): each matrix entry presumably encodes multiplication by one
// GF(2^8) coefficient as an 8x8 bit matrix - confirm against the Go caller.
//
// Processing begins at byte offset start in every slice and covers n>>5 whole
// blocks; a trailing partial block is ignored and the routine returns at once
// when n < 32.
//
// Register use:
//   CX                      matrix base
//   DX BX SI DI R8-R12 AX   cursors into in[0]..in[9]
//   R14 R15 R13             cursors into out[0]..out[2]
//   BP                      remaining block count
//   Y0-Y10                  broadcast matrix[0..10]
//   Y11-Y13                 accumulators for out[0..2]
//   Y14                     current input block
//   Y15                     scratch: on-demand matrix entry, then transformed block
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// preserves BP, which is used as a scratch register here - confirm.
TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88
	// Loading 11 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ sets ZF from its result, so
	// the zero check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x3Xor_end

	// Keep the first 11 matrix entries resident; entries 11..29 are
	// broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Data pointers of in[0]..in[9] (slice headers are 24 bytes apart).
	// AX is overwritten last, which is why the block count is reloaded below.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), R12
	MOVQ 216(AX), AX

	// Data pointers of out[0]..out[2]; R13 is the header base and is
	// replaced by out[2]'s pointer last.
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_10x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R13), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (from matrix[11] onward the entry is broadcast into Y15 on demand)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 3 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_10x3Xor_loop
	VZEROUPPER

mulAvxGFNI_10x3Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x4_64 computes 4 output slices from 10 input slices, 64 bytes per
// iteration, overwriting out. For input i and output j the 64-byte input
// block is run through VGF2P8AFFINEQB (constant term 0) with the 64-bit entry
// matrix[i*4+j]; the ten results for output j are XORed together and stored.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// GF(2^8) coefficient as an 8x8 bit matrix - confirm against the Go caller.
//
// Processing begins at byte offset start in every slice and covers n>>6 whole
// blocks; a trailing partial block is ignored and the routine returns at once
// when n < 64.
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base
//   BX SI DI R8-R13 DX      cursors into in[0]..in[9]
//   R14                     base of the out slice-header array
//   R15                     running byte offset applied to every output
//   BP                      scratch: data pointer of the output being stored
//   Z0-Z25                  broadcast matrix[0..25]
//   Z26-Z29                 accumulators for out[0..3]
//   Z30                     current input block
//   Z31                     scratch: transformed block
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// preserves BP, which is used as a scratch register here - confirm.
TEXT ·mulGFNI_10x4_64(SB), $8-88
	// Loading 26 of 40 tables to registers
	// Destination kept on stack
	// Full registers estimated 46 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ sets ZF from its result, so
	// the zero check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_10x4_64_end

	// Keep the first 26 matrix entries resident; entries 26..39 are read
	// with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25

	// Data pointers of in[0]..in[9] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output pointers are not cached: R14 keeps the header base and each
	// out[j] pointer is re-read from memory when its block is stored.
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, so no prior clear is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	// (from matrix[26] onward the entry is broadcast straight from memory)
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 4 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 4 outputs
	MOVQ (R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x4_64_loop
	VZEROUPPER

mulGFNI_10x4_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x4 computes 4 output slices from 10 input slices, 32 bytes per
// iteration, overwriting out. For input i and output j the 32-byte input
// block is run through VGF2P8AFFINEQB (constant term 0) with the 64-bit entry
// matrix[i*4+j]; the ten results for output j are XORed together and stored.
// NOTE(review): each matrix entry presumably encodes multiplication by one
// GF(2^8) coefficient as an 8x8 bit matrix - confirm against the Go caller.
//
// Processing begins at byte offset start in every slice and covers n>>5 whole
// blocks; a trailing partial block is ignored and the routine returns at once
// when n < 32.
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base
//   BX SI DI R8-R13 DX      cursors into in[0]..in[9]
//   R14                     base of the out slice-header array
//   R15                     running byte offset applied to every output
//   BP                      scratch: data pointer of the output being stored
//   Y0-Y9                   broadcast matrix[0..9]
//   Y10-Y13                 accumulators for out[0..3]
//   Y14                     current input block
//   Y15                     scratch: on-demand matrix entry, then transformed block
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// preserves BP, which is used as a scratch register here - confirm.
TEXT ·mulAvxGFNI_10x4(SB), $8-88
	// Loading 10 of 40 tables to registers
	// Destination kept on stack
	// Full registers estimated 46 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ sets ZF from its result, so
	// the zero check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x4_end

	// Keep the first 10 matrix entries resident; entries 10..39 are
	// broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers of in[0]..in[9] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output pointers are not cached: R14 keeps the header base and each
	// out[j] pointer is re-read from memory when its block is stored.
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, so no prior clear is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from matrix[10] onward the entry is broadcast into Y15 on demand)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 4 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	MOVQ (R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x4_loop
	VZEROUPPER

mulAvxGFNI_10x4_end:
	RET
|
|
|
|
// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x4_64Xor folds 10 input slices into 4 output slices, 64 bytes per
// iteration, XORing into the bytes already present in out. For input i and
// output j the 64-byte input block is run through VGF2P8AFFINEQB (constant
// term 0) with the 64-bit entry matrix[i*4+j], and the result is XORed into
// the accumulator for out[j].
// NOTE(review): each matrix entry presumably encodes multiplication by one
// GF(2^8) coefficient as an 8x8 bit matrix - confirm against the Go caller.
//
// Processing begins at byte offset start in every slice and covers n>>6 whole
// blocks; a trailing partial block is ignored and the routine returns at once
// when n < 64.
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base
//   BX SI DI R8-R13 DX      cursors into in[0]..in[9]
//   R14                     base of the out slice-header array
//   R15                     running byte offset applied to every output
//   BP                      scratch: data pointer of the output being accessed
//   Z0-Z25                  broadcast matrix[0..25]
//   Z26-Z29                 accumulators for out[0..3]
//   Z30                     current input block
//   Z31                     scratch: transformed block
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// preserves BP, which is used as a scratch register here - confirm.
TEXT ·mulGFNI_10x4_64Xor(SB), $8-88
	// Loading 26 of 40 tables to registers
	// Destination kept on stack
	// Full registers estimated 46 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks. SHRQ sets ZF from its result, so
	// the zero check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_10x4_64Xor_end

	// Keep the first 26 matrix entries resident; entries 26..39 are read
	// with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25

	// Data pointers of in[0]..in[9] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output pointers are not cached: R14 keeps the header base and each
	// out[j] pointer is re-read from memory whenever its block is accessed.
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x4_64Xor_loop:
	// Load 4 outputs
	MOVQ (R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ 24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ 48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ 72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	// (from matrix[26] onward the entry is broadcast straight from memory)
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 4 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 4 outputs
	MOVQ (R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x4_64Xor_loop
	VZEROUPPER

mulGFNI_10x4_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x4Xor folds 10 input slices into 4 output slices, 32 bytes per
// iteration, XORing into the bytes already present in out. For input i and
// output j the 32-byte input block is run through VGF2P8AFFINEQB (constant
// term 0) with the 64-bit entry matrix[i*4+j], and the result is XORed into
// the accumulator for out[j].
// NOTE(review): each matrix entry presumably encodes multiplication by one
// GF(2^8) coefficient as an 8x8 bit matrix - confirm against the Go caller.
//
// Processing begins at byte offset start in every slice and covers n>>5 whole
// blocks; a trailing partial block is ignored and the routine returns at once
// when n < 32.
//
// Register use:
//   AX                      remaining block count
//   CX                      matrix base
//   BX SI DI R8-R13 DX      cursors into in[0]..in[9]
//   R14                     base of the out slice-header array
//   R15                     running byte offset applied to every output
//   BP                      scratch: data pointer of the output being accessed
//   Y0-Y9                   broadcast matrix[0..9]
//   Y10-Y13                 accumulators for out[0..3]
//   Y14                     current input block
//   Y15                     scratch: on-demand matrix entry, then transformed block
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// preserves BP, which is used as a scratch register here - confirm.
TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88
	// Loading 10 of 40 tables to registers
	// Destination kept on stack
	// Full registers estimated 46 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks. SHRQ sets ZF from its result, so
	// the zero check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x4Xor_end

	// Keep the first 10 matrix entries resident; entries 10..39 are
	// broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers of in[0]..in[9] (slice headers are 24 bytes apart).
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output pointers are not cached: R14 keeps the header base and each
	// out[j] pointer is re-read from memory whenever its block is accessed.
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x4Xor_loop:
	// Load 4 outputs
	MOVQ (R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ 24(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ 48(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ 72(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	// (from matrix[10] onward the entry is broadcast into Y15 on demand)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 4 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	MOVQ (R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x4Xor_loop
	VZEROUPPER

mulAvxGFNI_10x4Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Produces 5 outputs from 10 inputs, 64 bytes per loop iteration, overwriting
// the previous output contents. For each 64-byte block, output j is the XOR
// over all inputs i of VGF2P8AFFINEQB(matrix[i*5+j], in[i]) with a zero
// affine constant.
//
// Only n>>6 whole 64-byte blocks are processed; any remainder of n is left
// untouched. start is a byte offset applied to every input and every output.
//
// Register use:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base; entries 25..49 are read from memory in the loop
//   Z0-Z24                  matrix entries 0..24, each 8-byte entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX      read cursors for in[0]..in[9]
//   R14                     base of the out slice headers (read at a 24-byte stride)
//   R15                     running byte offset into every output
//   BP                      scratch: data pointer of the output being stored
//   Z25-Z29                 accumulators for outputs 0..4
//   Z30 / Z31               current input block / per-product temporary
TEXT ·mulGFNI_10x5_64(SB), $8-88
	// Loading 25 of 50 tables to registers
	// Destination kept on stack
	// Full registers estimated 57 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x5_64_end

	// Matrix entries for inputs 0..4 stay resident in Z0-Z24.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Data pointers of in[0]..in[9]; DX is overwritten last because it
	// holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (first input: results initialise the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (from here on the matrix entries are broadcast from memory)
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 5 outputs
	VMOVDQU64           (R13), Z30
	ADDQ                $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 5 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 5 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ      (R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ      24(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ      48(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ      72(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ      96(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ  mulGFNI_10x5_64_loop
	VZEROUPPER

mulGFNI_10x5_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Produces 5 outputs from 10 inputs, 32 bytes per loop iteration, overwriting
// the previous output contents. For each 32-byte block, output j is the XOR
// over all inputs i of VGF2P8AFFINEQB(matrix[i*5+j], in[i]) with a zero
// affine constant.
//
// Only n>>5 whole 32-byte blocks are processed; any remainder of n is left
// untouched. start is a byte offset applied to every input and every output.
//
// Register use:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base; entries 9..49 are broadcast from memory in the loop
//   Y0-Y8                   matrix entries 0..8, each 8-byte entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX      read cursors for in[0]..in[9]
//   R14                     base of the out slice headers (read at a 24-byte stride)
//   R15                     running byte offset into every output
//   BP                      scratch: data pointer of the output being stored
//   Y9-Y13                  accumulators for outputs 0..4
//   Y14 / Y15               current input block / matrix entry and product temporary
TEXT ·mulAvxGFNI_10x5(SB), $8-88
	// Loading 9 of 50 tables to registers
	// Destination kept on stack
	// Full registers estimated 57 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_10x5_end

	// Only the first 9 matrix entries fit in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0]..in[9]; DX is overwritten last because it
	// holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (first input: results initialise the accumulators, no XOR needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	// (the last resident entry is Y8; entry 9 onwards comes from memory)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 5 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 5 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ    (R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ    24(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ    48(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ    72(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ    96(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxGFNI_10x5_loop
	VZEROUPPER

mulAvxGFNI_10x5_end:
	RET
|
|
|
|
// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: 5 outputs from 10 inputs, 64 bytes per loop
// iteration. Each output block is first loaded from memory, then XORed with
// VGF2P8AFFINEQB(matrix[i*5+j], in[i]) (zero affine constant) for every
// input i, and finally written back.
//
// Only n>>6 whole 64-byte blocks are processed; any remainder of n is left
// untouched. start is a byte offset applied to every input and every output.
//
// Register use:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base; entries 25..49 are read from memory in the loop
//   Z0-Z24                  matrix entries 0..24, each 8-byte entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX      read cursors for in[0]..in[9]
//   R14                     base of the out slice headers (read at a 24-byte stride)
//   R15                     running byte offset into every output
//   BP                      scratch: data pointer of the output being loaded/stored
//   Z25-Z29                 accumulators for outputs 0..4
//   Z30 / Z31               current input block / per-product temporary
TEXT ·mulGFNI_10x5_64Xor(SB), $8-88
	// Loading 25 of 50 tables to registers
	// Destination kept on stack
	// Full registers estimated 57 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x5_64Xor_end

	// Matrix entries for inputs 0..4 stay resident in Z0-Z24.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24

	// Data pointers of in[0]..in[9]; DX is overwritten last because it
	// holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x5_64Xor_loop:
	// Load 5 outputs
	// (existing output contents seed the accumulators)
	MOVQ      (R14), BP
	VMOVDQU64 (BP)(R15*1), Z25
	MOVQ      24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ      48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ      72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ      96(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64      (R9), Z30
	ADDQ           $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// (from here on the matrix entries are broadcast from memory)
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 5 outputs
	VMOVDQU64           (R13), Z30
	ADDQ                $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 5 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 5 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ      (R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ      24(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ      48(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ      72(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ      96(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ  mulGFNI_10x5_64Xor_loop
	VZEROUPPER

mulGFNI_10x5_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: 5 outputs from 10 inputs, 32 bytes per loop
// iteration. Each output block is first loaded from memory, then XORed with
// VGF2P8AFFINEQB(matrix[i*5+j], in[i]) (zero affine constant) for every
// input i, and finally written back.
//
// Only n>>5 whole 32-byte blocks are processed; any remainder of n is left
// untouched. start is a byte offset applied to every input and every output.
//
// Register use:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base; entries 9..49 are broadcast from memory in the loop
//   Y0-Y8                   matrix entries 0..8, each 8-byte entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX      read cursors for in[0]..in[9]
//   R14                     base of the out slice headers (read at a 24-byte stride)
//   R15                     running byte offset into every output
//   BP                      scratch: data pointer of the output being loaded/stored
//   Y9-Y13                  accumulators for outputs 0..4
//   Y14 / Y15               current input block / matrix entry and product temporary
TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88
	// Loading 9 of 50 tables to registers
	// Destination kept on stack
	// Full registers estimated 57 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from the result, so the block
	// count can be tested directly without a separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_10x5Xor_end

	// Only the first 9 matrix entries fit in registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0]..in[9]; DX is overwritten last because it
	// holds the slice-header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x5Xor_loop:
	// Load 5 outputs
	// (existing output contents seed the accumulators)
	MOVQ    (R14), BP
	VMOVDQU (BP)(R15*1), Y9
	MOVQ    24(R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ    48(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ    72(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ    96(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	// (the last resident entry is Y8; entry 9 onwards comes from memory)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 5 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 5 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 5 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 5 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ    (R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ    24(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ    48(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ    72(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ    96(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxGFNI_10x5Xor_loop
	VZEROUPPER

mulAvxGFNI_10x5Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_10x6_64(SB), $8-88
|
|
// Loading 24 of 60 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 68 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_10x6_64_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
VBROADCASTF32X2 168(CX), Z21
|
|
VBROADCASTF32X2 176(CX), Z22
|
|
VBROADCASTF32X2 184(CX), Z23
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), R13
|
|
MOVQ 216(DX), DX
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ start+72(FP), R15
|
|
|
|
// Add start offset to input
|
|
ADDQ R15, BX
|
|
ADDQ R15, SI
|
|
ADDQ R15, DI
|
|
ADDQ R15, R8
|
|
ADDQ R15, R9
|
|
ADDQ R15, R10
|
|
ADDQ R15, R11
|
|
ADDQ R15, R12
|
|
ADDQ R15, R13
|
|
ADDQ R15, DX
|
|
|
|
mulGFNI_10x6_64_loop:
|
|
// Load and process 64 bytes from input 0 to 6 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 6 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 6 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 6 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 6 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 6 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 6 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 6 outputs
|
|
VMOVDQU64 (R12), Z30
|
|
ADDQ $0x40, R12
|
|
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 6 outputs
|
|
VMOVDQU64 (R13), Z30
|
|
ADDQ $0x40, R13
|
|
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 9 to 6 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 6 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU64 Z24, (BP)(R15*1)
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU64 Z25, (BP)(R15*1)
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU64 Z26, (BP)(R15*1)
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU64 Z27, (BP)(R15*1)
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU64 Z28, (BP)(R15*1)
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU64 Z29, (BP)(R15*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x40, R15
|
|
DECQ AX
|
|
JNZ mulGFNI_10x6_64_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_10x6_64_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x6 combines 10 input slices into 6 output slices, 32 bytes per
// loop iteration. For output o, every iteration stores
//
//	XOR over i = 0..9 of VGF2P8AFFINEQB(chunk of in[i], matrix[i*6+o])
//
// at byte offset R15 of out[o]. Previous output contents are overwritten.
// The iteration count is n shifted right by 5, so only whole 32-byte chunks
// are processed and the routine returns immediately when that count is zero.
//
// Register roles:
//	AX          remaining iterations
//	CX          matrix base (entries are 8 bytes apart)
//	BX, SI, DI, R8-R13, DX
//	            read cursors for in[0]..in[9], already advanced by start
//	            (DX holds in_base only during setup)
//	R14         base of the out slice-header array (24-byte stride)
//	R15         byte offset applied to every output; starts at start
//	BP          scratch: data pointer of the output being stored
//	Y0-Y7       matrix[0..7], broadcast once before the loop
//	Y8-Y13      accumulators for outputs 0..5
//	Y14         current input chunk
//	Y15         scratch: broadcast matrix entry, then its product
//
// NOTE(review): BP is overwritten inside the loop; the non-zero $8 frame
// presumably exists so the assembler preserves the caller's BP - confirm
// against the generator.
TEXT ·mulAvxGFNI_10x6(SB), $8-88
	// Loading 8 of 60 tables to registers
	// Output pointers are not cached; they are re-read through R14 every iteration
	// Full registers estimated 68 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX
	SHRQ         $0x05, AX
	TESTQ        AX, AX
	JZ           mulAvxGFNI_10x6_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), R11
	MOVQ         168(DX), R12
	MOVQ         192(DX), R13
	MOVQ         216(DX), DX
	MOVQ         out_base+48(FP), R14
	MOVQ         start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (writes the accumulators directly, so no prior clearing is needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	// (matrix[6..7] come from Y6/Y7, the rest is broadcast from memory)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 6 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	// (each data pointer is fetched from its slice header, then indexed by R15)
	MOVQ    (R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ    24(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ    48(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ    72(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ    96(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ    120(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxGFNI_10x6_loop
	VZEROUPPER

mulAvxGFNI_10x6_end:
	RET
|
|
|
|
// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x6_64Xor combines 10 input slices into 6 output slices, 64 bytes
// per loop iteration, accumulating into the existing output: each iteration
// first loads the current 64 bytes of every out[o] at byte offset R15, XORs in
//
//	VGF2P8AFFINEQB(chunk of in[i], matrix[i*6+o])   for i = 0..9
//
// and stores the result back to the same location.
// The iteration count is n shifted right by 6, so only whole 64-byte chunks
// are processed and the routine returns immediately when that count is zero.
//
// Register roles:
//	AX          remaining iterations
//	CX          matrix base (entries are 8 bytes apart)
//	BX, SI, DI, R8-R13, DX
//	            read cursors for in[0]..in[9], already advanced by start
//	            (DX holds in_base only during setup)
//	R14         base of the out slice-header array (24-byte stride)
//	R15         byte offset applied to every output; starts at start
//	BP          scratch: data pointer of the output being loaded/stored
//	Z0-Z23      matrix[0..23] (inputs 0..3), broadcast once before the loop
//	Z24-Z29     accumulators for outputs 0..5
//	Z30         current input chunk
//	Z31         scratch: product to be XORed into an accumulator
//
// Matrix entries 24..59 (inputs 4..9) are not cached; they are supplied as
// broadcast memory operands (.BCST) on every iteration.
//
// NOTE(review): BP is overwritten inside the loop; the non-zero $8 frame
// presumably exists so the assembler preserves the caller's BP - confirm
// against the generator.
TEXT ·mulGFNI_10x6_64Xor(SB), $8-88
	// Loading 24 of 60 tables to registers
	// Output pointers are not cached; they are re-read through R14 every iteration
	// Full registers estimated 68 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x06, AX
	TESTQ           AX, AX
	JZ              mulGFNI_10x6_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), R8
	MOVQ            96(DX), R9
	MOVQ            120(DX), R10
	MOVQ            144(DX), R11
	MOVQ            168(DX), R12
	MOVQ            192(DX), R13
	MOVQ            216(DX), DX
	MOVQ            out_base+48(FP), R14
	MOVQ            start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x6_64Xor_loop:
	// Load 6 outputs
	// (existing output bytes seed the accumulators)
	MOVQ      (R14), BP
	VMOVDQU64 (BP)(R15*1), Z24
	MOVQ      24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z25
	MOVQ      48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ      72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ      96(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ      120(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64      (R8), Z30
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (from here on the matrix entry is a broadcast memory operand)
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 6 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 6 outputs
	VMOVDQU64           (R13), Z30
	ADDQ                $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 6 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 6 outputs
	MOVQ      (R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ      24(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ      48(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ      72(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ      96(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ      120(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ  mulGFNI_10x6_64Xor_loop
	VZEROUPPER

mulGFNI_10x6_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x6Xor combines 10 input slices into 6 output slices, 32 bytes
// per loop iteration, accumulating into the existing output: each iteration
// first loads the current 32 bytes of every out[o] at byte offset R15, XORs in
//
//	VGF2P8AFFINEQB(chunk of in[i], matrix[i*6+o])   for i = 0..9
//
// and stores the result back to the same location.
// The iteration count is n shifted right by 5, so only whole 32-byte chunks
// are processed and the routine returns immediately when that count is zero.
//
// Register roles:
//	AX          remaining iterations
//	CX          matrix base (entries are 8 bytes apart)
//	BX, SI, DI, R8-R13, DX
//	            read cursors for in[0]..in[9], already advanced by start
//	            (DX holds in_base only during setup)
//	R14         base of the out slice-header array (24-byte stride)
//	R15         byte offset applied to every output; starts at start
//	BP          scratch: data pointer of the output being loaded/stored
//	Y0-Y7       matrix[0..7], broadcast once before the loop
//	Y8-Y13      accumulators for outputs 0..5
//	Y14         current input chunk
//	Y15         scratch: broadcast matrix entry, then its product
//
// NOTE(review): BP is overwritten inside the loop; the non-zero $8 frame
// presumably exists so the assembler preserves the caller's BP - confirm
// against the generator.
TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88
	// Loading 8 of 60 tables to registers
	// Output pointers are not cached; they are re-read through R14 every iteration
	// Full registers estimated 68 YMM used
	MOVQ         n+80(FP), AX
	MOVQ         matrix_base+0(FP), CX
	SHRQ         $0x05, AX
	TESTQ        AX, AX
	JZ           mulAvxGFNI_10x6Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	MOVQ         in_base+24(FP), DX
	MOVQ         (DX), BX
	MOVQ         24(DX), SI
	MOVQ         48(DX), DI
	MOVQ         72(DX), R8
	MOVQ         96(DX), R9
	MOVQ         120(DX), R10
	MOVQ         144(DX), R11
	MOVQ         168(DX), R12
	MOVQ         192(DX), R13
	MOVQ         216(DX), DX
	MOVQ         out_base+48(FP), R14
	MOVQ         start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x6Xor_loop:
	// Load 6 outputs
	// (existing output bytes seed the accumulators)
	MOVQ    (R14), BP
	VMOVDQU (BP)(R15*1), Y8
	MOVQ    24(R14), BP
	VMOVDQU (BP)(R15*1), Y9
	MOVQ    48(R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ    72(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ    96(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ    120(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	// (matrix[6..7] come from Y6/Y7, the rest is broadcast from memory)
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 6 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 6 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 6 outputs
	MOVQ    (R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ    24(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ    48(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ    72(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ    96(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ    120(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxGFNI_10x6Xor_loop
	VZEROUPPER

mulAvxGFNI_10x6Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_10x7_64(SB), $8-88
|
|
// Loading 23 of 70 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 79 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_10x7_64_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
VBROADCASTF32X2 168(CX), Z21
|
|
VBROADCASTF32X2 176(CX), Z22
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), R13
|
|
MOVQ 216(DX), DX
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ start+72(FP), R15
|
|
|
|
// Add start offset to input
|
|
ADDQ R15, BX
|
|
ADDQ R15, SI
|
|
ADDQ R15, DI
|
|
ADDQ R15, R8
|
|
ADDQ R15, R9
|
|
ADDQ R15, R10
|
|
ADDQ R15, R11
|
|
ADDQ R15, R12
|
|
ADDQ R15, R13
|
|
ADDQ R15, DX
|
|
|
|
mulGFNI_10x7_64_loop:
|
|
// Load and process 64 bytes from input 0 to 7 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 7 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 7 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 7 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 7 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 7 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 7 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 7 outputs
|
|
VMOVDQU64 (R12), Z30
|
|
ADDQ $0x40, R12
|
|
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 7 outputs
|
|
VMOVDQU64 (R13), Z30
|
|
ADDQ $0x40, R13
|
|
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 9 to 7 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 7 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU64 Z23, (BP)(R15*1)
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU64 Z24, (BP)(R15*1)
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU64 Z25, (BP)(R15*1)
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU64 Z26, (BP)(R15*1)
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU64 Z27, (BP)(R15*1)
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU64 Z28, (BP)(R15*1)
|
|
MOVQ 144(R14), BP
|
|
VMOVDQU64 Z29, (BP)(R15*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x40, R15
|
|
DECQ AX
|
|
JNZ mulGFNI_10x7_64_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_10x7_64_end:
|
|
RET
|
|
|
|
// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x7 combines 10 inputs into 7 outputs, 32 bytes per iteration:
//
//   out[j][start+k] = XOR over i in 0..9 of affine(matrix[i*7+j], in[i][start+k])
//
// where affine is VGF2P8AFFINEQB with a zero immediate. The previous contents
// of the outputs are overwritten. Only n/32 whole blocks are processed; when
// that count is zero the routine returns before touching any vector register
// or memory.
//
// Register roles:
//   AX            remaining 32-byte blocks
//   CX            base of matrix (70 entries of 8 bytes, index = input*7+output)
//   BX, SI, DI,
//   R8-R13, DX    read cursors for inputs 0..9 (DX first holds in_base)
//   R14           base of the out slice headers (24 bytes per header)
//   R15           byte offset applied to every output, starts at `start`
//   BP            scratch: data pointer of the output being stored
//   Y0-Y6         matrices for input 0 (the only ones kept in registers)
//   Y7-Y13        accumulators for outputs 0..6
//   Y14           current 32 input bytes
//   Y15           scratch: broadcast matrix, then its product
//
// NOTE(review): the 8-byte frame presumably exists so that the assembler
// saves and restores BP, which is used as scratch here - confirm.
// NOTE: this file is generated (see the file header); mirror any change made
// here in the generator.
TEXT ·mulAvxGFNI_10x7(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count into a block count. SHRQ by a non-zero immediate
	// sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x7_end

	// Keep the 7 matrices of input 0 in registers; the remaining 63 are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice. DX is read last because it
	// doubles as the base pointer of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output data pointers are not cached; they are re-read through R14 on
	// every store. (A single load of out_base suffices.)
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs.
	// The first product initialises each accumulator, so no XOR is needed.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 7 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs: re-read each output's data pointer from its slice
	// header and write at the running offset R15.
	MOVQ (R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x7_loop

	// Only reached after the loop ran, i.e. after YMM registers were written.
	VZEROUPPER

mulAvxGFNI_10x7_end:
	RET
|
|
|
|
// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x7_64Xor combines 10 inputs into 7 outputs, 64 bytes per
// iteration, and XORs the result into the existing output contents:
//
//   out[j][start+k] ^= XOR over i in 0..9 of affine(matrix[i*7+j], in[i][start+k])
//
// where affine is VGF2P8AFFINEQB with a zero immediate. Only n/64 whole
// blocks are processed; when that count is zero the routine returns before
// touching any vector register or memory.
//
// Register roles:
//   AX            remaining 64-byte blocks
//   CX            base of matrix (70 entries of 8 bytes, index = input*7+output)
//   BX, SI, DI,
//   R8-R13, DX    read cursors for inputs 0..9 (DX first holds in_base)
//   R14           base of the out slice headers (24 bytes per header)
//   R15           byte offset applied to every output, starts at `start`
//   BP            scratch: data pointer of the output being loaded/stored
//   Z0-Z22        first 23 matrices (inputs 0..2 fully, input 3 outputs 0..1)
//   Z23-Z29       accumulators for outputs 0..6
//   Z30           current 64 input bytes
//   Z31           scratch product
//
// NOTE(review): the 8-byte frame presumably exists so that the assembler
// saves and restores BP, which is used as scratch here - confirm.
// NOTE: this file is generated (see the file header); mirror any change made
// here in the generator.
TEXT ·mulGFNI_10x7_64Xor(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count into a block count. SHRQ by a non-zero immediate
	// sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_10x7_64Xor_end

	// Keep 23 of the 70 matrices in registers; the remaining ones are used
	// as broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Fetch the data pointer of each input slice. DX is read last because it
	// doubles as the base pointer of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output data pointers are not cached; they are re-read through R14 on
	// every load and store. (A single load of out_base suffices.)
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x7_64Xor_loop:
	// Load 7 outputs: the accumulators start from the current output bytes.
	MOVQ (R14), BP
	VMOVDQU64 (BP)(R15*1), Z23
	MOVQ 24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z24
	MOVQ 48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z25
	MOVQ 72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ 96(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ 120(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ 144(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs.
	// From output 2 onwards the matrices no longer fit in registers and are
	// taken as broadcast memory operands.
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 7 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 7 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs at the same offset they were loaded from.
	MOVQ (R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x7_64Xor_loop

	// Only reached after the loop ran, i.e. after vector registers were written.
	VZEROUPPER

mulGFNI_10x7_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x7Xor combines 10 inputs into 7 outputs, 32 bytes per
// iteration, and XORs the result into the existing output contents:
//
//   out[j][start+k] ^= XOR over i in 0..9 of affine(matrix[i*7+j], in[i][start+k])
//
// where affine is VGF2P8AFFINEQB with a zero immediate. Only n/32 whole
// blocks are processed; when that count is zero the routine returns before
// touching any vector register or memory.
//
// Register roles:
//   AX            remaining 32-byte blocks
//   CX            base of matrix (70 entries of 8 bytes, index = input*7+output)
//   BX, SI, DI,
//   R8-R13, DX    read cursors for inputs 0..9 (DX first holds in_base)
//   R14           base of the out slice headers (24 bytes per header)
//   R15           byte offset applied to every output, starts at `start`
//   BP            scratch: data pointer of the output being loaded/stored
//   Y0-Y6         matrices for input 0 (the only ones kept in registers)
//   Y7-Y13        accumulators for outputs 0..6
//   Y14           current 32 input bytes
//   Y15           scratch: broadcast matrix, then its product
//
// NOTE(review): the 8-byte frame presumably exists so that the assembler
// saves and restores BP, which is used as scratch here - confirm.
// NOTE: this file is generated (see the file header); mirror any change made
// here in the generator.
TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// Convert the byte count into a block count. SHRQ by a non-zero immediate
	// sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x7Xor_end

	// Keep the 7 matrices of input 0 in registers; the remaining 63 are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each input slice. DX is read last because it
	// doubles as the base pointer of the slice headers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// Output data pointers are not cached; they are re-read through R14 on
	// every load and store. (A single load of out_base suffices.)
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x7Xor_loop:
	// Load 7 outputs: the accumulators start from the current output bytes.
	MOVQ (R14), BP
	VMOVDQU (BP)(R15*1), Y7
	MOVQ 24(R14), BP
	VMOVDQU (BP)(R15*1), Y8
	MOVQ 48(R14), BP
	VMOVDQU (BP)(R15*1), Y9
	MOVQ 72(R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ 96(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ 120(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ 144(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 7 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs at the same offset they were loaded from.
	MOVQ (R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x7Xor_loop

	// Only reached after the loop ran, i.e. after YMM registers were written.
	VZEROUPPER

mulAvxGFNI_10x7Xor_end:
	RET
|
|
|
|
// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x8_64 combines 10 input slices into 8 output slices, 64 bytes per
// loop iteration. For every (input i, output j) pair the current input chunk
// is transformed with VGF2P8AFFINEQB using the 8-byte entry matrix[i*8+j];
// the 10 per-input results are XORed together to form output j. The previous
// contents of the outputs are overwritten (see the Xor variant for accumulate).
//
//   matrix: 80 uint64 entries. The first 22 are broadcast into Z0-Z21 once;
//           the remaining ones are read from memory with an embedded
//           broadcast (.BCST) on every iteration.
//   in:     10 slice headers (24-byte stride); only the base pointers are read.
//   out:    8 slice headers (24-byte stride); base pointers are re-read from
//           memory on every iteration.
//   start:  byte offset applied to every input and every output slice.
//   n:      byte count; n>>6 blocks of 64 bytes are processed and any
//           remainder is left untouched. Returns immediately if n>>6 == 0.
//
// Register roles:
//   AX          remaining 64-byte blocks
//   CX          matrix base
//   BX,SI,DI,R8,R9,R10,R11,R12,R13,DX   input pointers 0..9 (already offset by start)
//   R14         base of the out slice-header array
//   R15         running byte offset into the outputs (starts at start)
//   BP          scratch: current output base pointer
//   Z22-Z29     accumulators for outputs 0..7
//   Z30         current input chunk, Z31 temporary product
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero frame
// size ($8) exists so that the assembler saves/restores BP - confirm.
// NOTE(review): this file is generated (see file header). The redundant
// TESTQ after SHRQ and the duplicated out_base load were dropped here; mirror
// that in the generator or the change is lost on regeneration.
TEXT ·mulGFNI_10x8_64(SB), $8-88
	// Loading 22 of 80 tables to registers
	// Destination kept on stack
	// Full registers estimated 90 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// zero-block case can branch directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x8_64_end

	// Matrix entries 0..21 stay resident in Z0-Z21.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Base pointers of in[0..9]; DX is reused last for in[9].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (resident tables run out after Z21; the rest come from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 8 outputs
	VMOVDQU64           (R13), Z30
	ADDQ                $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// (each output base pointer is fetched from the slice-header array,
	// then indexed by the running offset R15)
	MOVQ      (R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ      24(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ      48(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ      72(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ      96(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ      120(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ      144(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ      168(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ  mulGFNI_10x8_64_loop
	VZEROUPPER

mulGFNI_10x8_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x8 combines 10 input slices into 8 output slices, 32 bytes per
// loop iteration. For every (input i, output j) pair the current input chunk
// is transformed with VGF2P8AFFINEQB using the 8-byte entry matrix[i*8+j];
// the 10 per-input results are XORed together to form output j. The previous
// contents of the outputs are overwritten.
//
//   matrix: 80 uint64 entries. The first 6 are broadcast into Y0-Y5 once;
//           every other entry is re-broadcast from memory on each iteration.
//   in:     10 slice headers (24-byte stride); only the base pointers are read.
//   out:    8 slice headers (24-byte stride); base pointers are re-read from
//           memory on every iteration.
//   start:  byte offset applied to every input and every output slice.
//   n:      byte count; n>>5 blocks of 32 bytes are processed and any
//           remainder is left untouched. Returns immediately if n>>5 == 0.
//
// Register roles:
//   AX          remaining 32-byte blocks
//   CX          matrix base
//   BX,SI,DI,R8,R9,R10,R11,R12,R13,DX   input pointers 0..9 (already offset by start)
//   R14         base of the out slice-header array
//   R15         running byte offset into the outputs (starts at start)
//   BP          scratch: current output base pointer
//   Y6-Y13      accumulators for outputs 0..7
//   Y14         current input chunk, Y15 table / temporary product
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero frame
// size ($8) exists so that the assembler saves/restores BP - confirm.
// NOTE(review): this file is generated (see file header). The redundant
// TESTQ after SHRQ and the duplicated out_base load were dropped here; mirror
// that in the generator or the change is lost on regeneration.
TEXT ·mulAvxGFNI_10x8(SB), $8-88
	// Loading 6 of 80 tables to registers
	// Destination kept on stack
	// Full registers estimated 90 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// zero-block case can branch directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_10x8_end

	// Matrix entries 0..5 stay resident in Y0-Y5.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Base pointers of in[0..9]; DX is reused last for in[9].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (first input initialises the accumulators, no XOR needed; outputs 6
	// and 7 load their table straight into the accumulator register)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD   48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 8 outputs
	VMOVDQU        (R12), Y14
	ADDQ           $0x20, R12
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 8 outputs
	VMOVDQU        (R13), Y14
	ADDQ           $0x20, R13
	VBROADCASTSD   512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	// (each output base pointer is fetched from the slice-header array,
	// then indexed by the running offset R15)
	MOVQ    (R14), BP
	VMOVDQU Y6, (BP)(R15*1)
	MOVQ    24(R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ    48(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ    72(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ    96(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ    120(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ    144(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ    168(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxGFNI_10x8_loop
	VZEROUPPER

mulAvxGFNI_10x8_end:
	RET
|
|
|
|
// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x8_64Xor is the accumulating variant of the 10-input / 8-output
// kernel, 64 bytes per loop iteration. Each iteration first loads the current
// 64 bytes of all 8 outputs, then for every (input i, output j) pair
// transforms the input chunk with VGF2P8AFFINEQB using the 8-byte entry
// matrix[i*8+j] and XORs the result into output j, and finally writes the 8
// outputs back.
//
//   matrix: 80 uint64 entries. The first 22 are broadcast into Z0-Z21 once;
//           the remaining ones are read from memory with an embedded
//           broadcast (.BCST) on every iteration.
//   in:     10 slice headers (24-byte stride); only the base pointers are read.
//   out:    8 slice headers (24-byte stride); base pointers are re-read from
//           memory for the load and again for the store of every iteration.
//   start:  byte offset applied to every input and every output slice.
//   n:      byte count; n>>6 blocks of 64 bytes are processed and any
//           remainder is left untouched. Returns immediately if n>>6 == 0.
//
// Register roles:
//   AX          remaining 64-byte blocks
//   CX          matrix base
//   BX,SI,DI,R8,R9,R10,R11,R12,R13,DX   input pointers 0..9 (already offset by start)
//   R14         base of the out slice-header array
//   R15         running byte offset into the outputs (starts at start)
//   BP          scratch: current output base pointer
//   Z22-Z29     accumulators for outputs 0..7
//   Z30         current input chunk, Z31 temporary product
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero frame
// size ($8) exists so that the assembler saves/restores BP - confirm.
// NOTE(review): this file is generated (see file header). The redundant
// TESTQ after SHRQ and the duplicated out_base load were dropped here; mirror
// that in the generator or the change is lost on regeneration.
TEXT ·mulGFNI_10x8_64Xor(SB), $8-88
	// Loading 22 of 80 tables to registers
	// Destination kept on stack
	// Full registers estimated 90 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// zero-block case can branch directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x8_64Xor_end

	// Matrix entries 0..21 stay resident in Z0-Z21.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Base pointers of in[0..9]; DX is reused last for in[9].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x8_64Xor_loop:
	// Load 8 outputs
	// (existing output bytes seed the accumulators)
	MOVQ      (R14), BP
	VMOVDQU64 (BP)(R15*1), Z22
	MOVQ      24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z23
	MOVQ      48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z24
	MOVQ      72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z25
	MOVQ      96(R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ      120(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ      144(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ      168(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (resident tables run out after Z21; the rest come from memory)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64           (R12), Z30
	ADDQ                $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 8 outputs
	VMOVDQU64           (R13), Z30
	ADDQ                $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// (each output base pointer is fetched from the slice-header array,
	// then indexed by the running offset R15)
	MOVQ      (R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ      24(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ      48(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ      72(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ      96(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ      120(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ      144(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ      168(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ  mulGFNI_10x8_64Xor_loop
	VZEROUPPER

mulGFNI_10x8_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88
|
|
// Loading 6 of 80 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 90 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_10x8Xor_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
VBROADCASTSD 32(CX), Y4
|
|
VBROADCASTSD 40(CX), Y5
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), R13
|
|
MOVQ 216(DX), DX
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ start+72(FP), R15
|
|
|
|
// Add start offset to input
|
|
ADDQ R15, BX
|
|
ADDQ R15, SI
|
|
ADDQ R15, DI
|
|
ADDQ R15, R8
|
|
ADDQ R15, R9
|
|
ADDQ R15, R10
|
|
ADDQ R15, R11
|
|
ADDQ R15, R12
|
|
ADDQ R15, R13
|
|
ADDQ R15, DX
|
|
|
|
mulAvxGFNI_10x8Xor_loop:
|
|
// Load 8 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU (BP)(R15*1), Y6
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y7
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y8
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y9
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y10
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y11
|
|
MOVQ 144(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y12
|
|
MOVQ 168(R14), BP
|
|
VMOVDQU (BP)(R15*1), Y13
|
|
|
|
// Load and process 32 bytes from input 0 to 8 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 48(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 56(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 8 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 64(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 72(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 8 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 8 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 8 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 8 outputs
|
|
VMOVDQU (R10), Y14
|
|
ADDQ $0x20, R10
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 360(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 368(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 376(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 6 to 8 outputs
|
|
VMOVDQU (R11), Y14
|
|
ADDQ $0x20, R11
|
|
VBROADCASTSD 384(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 392(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 400(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 408(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 416(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 424(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 432(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 440(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 7 to 8 outputs
|
|
VMOVDQU (R12), Y14
|
|
ADDQ $0x20, R12
|
|
VBROADCASTSD 448(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 456(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 464(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 472(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 480(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 488(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 496(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 504(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 8 to 8 outputs
|
|
VMOVDQU (R13), Y14
|
|
ADDQ $0x20, R13
|
|
VBROADCASTSD 512(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 520(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 528(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 536(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 544(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 552(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 560(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 568(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 9 to 8 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 576(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 584(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 592(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 600(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 608(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 616(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 624(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 632(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 8 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU Y6, (BP)(R15*1)
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU Y7, (BP)(R15*1)
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU Y8, (BP)(R15*1)
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU Y9, (BP)(R15*1)
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU Y10, (BP)(R15*1)
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU Y11, (BP)(R15*1)
|
|
MOVQ 144(R14), BP
|
|
VMOVDQU Y12, (BP)(R15*1)
|
|
MOVQ 168(R14), BP
|
|
VMOVDQU Y13, (BP)(R15*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R15
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_10x8Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_10x8Xor_end:
|
|
RET
|
|
|
|
// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte block, each of the 9 outputs is set to the XOR, over the
// 10 inputs, of VGF2P8AFFINEQB(input block, matrix entry). Previous output
// contents are overwritten. matrix is read as 90 uint64 entries; the entry for
// (input i, output o) sits at byte offset (i*9+o)*8. Only n>>6 whole blocks
// are processed, and nothing is touched when that count is zero.
//
// Register roles:
//   AX                      blocks remaining (n >> 6)
//   CX                      matrix base
//   BX, SI, DI, R8-R13, DX  read cursors for inputs 0-9 (offset by start)
//   R14                     base of the out slice headers (24 bytes apart)
//   R15                     byte offset into every output, initially start
//   BP                      scratch: data pointer of the output being stored
//   Z0-Z20                  matrix entries 0-20, broadcast once before the loop
//   Z21-Z29                 accumulators for outputs 0-8
//   Z30                     current input block
//   Z31                     scratch product
//
// NOTE(review): BP is clobbered as scratch; the non-zero ($8) frame presumably
// exists so the assembler saves and restores it - confirm against the generator.
TEXT ·mulGFNI_10x9_64(SB), $8-88
	// Loading 21 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 101 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ   mulGFNI_10x9_64_end

	// Matrix entries 0-20 stay resident in Z0-Z20
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Data pointers of in[0..9]; DX is overwritten last because it holds in_base
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (entries 21 and up are no longer register resident; they are
	// broadcast straight from memory by the .BCST form)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 9 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	// (each output's data pointer is re-read from its slice header into BP)
	MOVQ (R14), BP
	VMOVDQU64 Z21, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ  mulGFNI_10x9_64_loop
	VZEROUPPER

mulGFNI_10x9_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte block, each of the 9 outputs is set to the XOR, over the
// 10 inputs, of VGF2P8AFFINEQB(input block, matrix entry). Previous output
// contents are overwritten. matrix is read as 90 uint64 entries; the entry for
// (input i, output o) sits at byte offset (i*9+o)*8. Only n>>5 whole blocks
// are processed, and nothing is touched when that count is zero.
//
// Register roles:
//   AX                      blocks remaining (n >> 5)
//   CX                      matrix base
//   BX, SI, DI, R8-R13, DX  read cursors for inputs 0-9 (offset by start)
//   R14                     base of the out slice headers (24 bytes apart)
//   R15                     byte offset into every output, initially start
//   BP                      scratch: data pointer of the output being stored
//   Y0-Y4                   matrix entries 0-4, broadcast once before the loop
//   Y5-Y13                  accumulators for outputs 0-8
//   Y14                     current input block
//   Y15                     scratch: matrix entry, then its product
//
// NOTE(review): BP is clobbered as scratch; the non-zero ($8) frame presumably
// exists so the assembler saves and restores it - confirm against the generator.
TEXT ·mulAvxGFNI_10x9(SB), $8-88
	// Loading 5 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 101 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_10x9_end

	// Matrix entries 0-4 stay resident in Y0-Y4
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Data pointers of in[0..9]; DX is overwritten last because it holds in_base
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (first input writes the accumulators directly; for outputs 5-8 the
	// matrix entry is broadcast into the accumulator and then replaced
	// by its product)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 9 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	// (each output's data pointer is re-read from its slice header into BP)
	MOVQ (R14), BP
	VMOVDQU Y5, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y6, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxGFNI_10x9_loop
	VZEROUPPER

mulAvxGFNI_10x9_end:
	RET
|
|
|
|
// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
TEXT ·mulGFNI_10x9_64Xor(SB), $8-88
|
|
// Loading 21 of 90 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 101 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x06, AX
|
|
TESTQ AX, AX
|
|
JZ mulGFNI_10x9_64Xor_end
|
|
VBROADCASTF32X2 (CX), Z0
|
|
VBROADCASTF32X2 8(CX), Z1
|
|
VBROADCASTF32X2 16(CX), Z2
|
|
VBROADCASTF32X2 24(CX), Z3
|
|
VBROADCASTF32X2 32(CX), Z4
|
|
VBROADCASTF32X2 40(CX), Z5
|
|
VBROADCASTF32X2 48(CX), Z6
|
|
VBROADCASTF32X2 56(CX), Z7
|
|
VBROADCASTF32X2 64(CX), Z8
|
|
VBROADCASTF32X2 72(CX), Z9
|
|
VBROADCASTF32X2 80(CX), Z10
|
|
VBROADCASTF32X2 88(CX), Z11
|
|
VBROADCASTF32X2 96(CX), Z12
|
|
VBROADCASTF32X2 104(CX), Z13
|
|
VBROADCASTF32X2 112(CX), Z14
|
|
VBROADCASTF32X2 120(CX), Z15
|
|
VBROADCASTF32X2 128(CX), Z16
|
|
VBROADCASTF32X2 136(CX), Z17
|
|
VBROADCASTF32X2 144(CX), Z18
|
|
VBROADCASTF32X2 152(CX), Z19
|
|
VBROADCASTF32X2 160(CX), Z20
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), R13
|
|
MOVQ 216(DX), DX
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ start+72(FP), R15
|
|
|
|
// Add start offset to input
|
|
ADDQ R15, BX
|
|
ADDQ R15, SI
|
|
ADDQ R15, DI
|
|
ADDQ R15, R8
|
|
ADDQ R15, R9
|
|
ADDQ R15, R10
|
|
ADDQ R15, R11
|
|
ADDQ R15, R12
|
|
ADDQ R15, R13
|
|
ADDQ R15, DX
|
|
|
|
mulGFNI_10x9_64Xor_loop:
|
|
// Load 9 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z21
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z22
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z23
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z24
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z25
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z26
|
|
MOVQ 144(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z27
|
|
MOVQ 168(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z28
|
|
MOVQ 192(R14), BP
|
|
VMOVDQU64 (BP)(R15*1), Z29
|
|
|
|
// Load and process 64 bytes from input 0 to 9 outputs
|
|
VMOVDQU64 (BX), Z30
|
|
ADDQ $0x40, BX
|
|
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 1 to 9 outputs
|
|
VMOVDQU64 (SI), Z30
|
|
ADDQ $0x40, SI
|
|
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 2 to 9 outputs
|
|
VMOVDQU64 (DI), Z30
|
|
ADDQ $0x40, DI
|
|
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 3 to 9 outputs
|
|
VMOVDQU64 (R8), Z30
|
|
ADDQ $0x40, R8
|
|
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 4 to 9 outputs
|
|
VMOVDQU64 (R9), Z30
|
|
ADDQ $0x40, R9
|
|
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 5 to 9 outputs
|
|
VMOVDQU64 (R10), Z30
|
|
ADDQ $0x40, R10
|
|
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 6 to 9 outputs
|
|
VMOVDQU64 (R11), Z30
|
|
ADDQ $0x40, R11
|
|
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 7 to 9 outputs
|
|
VMOVDQU64 (R12), Z30
|
|
ADDQ $0x40, R12
|
|
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 8 to 9 outputs
|
|
VMOVDQU64 (R13), Z30
|
|
ADDQ $0x40, R13
|
|
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Load and process 64 bytes from input 9 to 9 outputs
|
|
VMOVDQU64 (DX), Z30
|
|
ADDQ $0x40, DX
|
|
VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
|
|
VXORPD Z21, Z31, Z21
|
|
VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
|
|
VXORPD Z22, Z31, Z22
|
|
VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
|
|
VXORPD Z23, Z31, Z23
|
|
VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
|
|
VXORPD Z24, Z31, Z24
|
|
VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
|
|
VXORPD Z25, Z31, Z25
|
|
VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
|
|
VXORPD Z26, Z31, Z26
|
|
VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
|
|
VXORPD Z27, Z31, Z27
|
|
VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
|
|
VXORPD Z28, Z31, Z28
|
|
VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
|
|
VXORPD Z29, Z31, Z29
|
|
|
|
// Store 9 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU64 Z21, (BP)(R15*1)
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU64 Z22, (BP)(R15*1)
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU64 Z23, (BP)(R15*1)
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU64 Z24, (BP)(R15*1)
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU64 Z25, (BP)(R15*1)
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU64 Z26, (BP)(R15*1)
|
|
MOVQ 144(R14), BP
|
|
VMOVDQU64 Z27, (BP)(R15*1)
|
|
MOVQ 168(R14), BP
|
|
VMOVDQU64 Z28, (BP)(R15*1)
|
|
MOVQ 192(R14), BP
|
|
VMOVDQU64 Z29, (BP)(R15*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x40, R15
|
|
DECQ AX
|
|
JNZ mulGFNI_10x9_64Xor_loop
|
|
VZEROUPPER
|
|
|
|
mulGFNI_10x9_64Xor_end:
|
|
RET
|
|
|
|
// mulAvxGFNI_10x9Xor multiplies 10 input slices by a 10x9 set of 8x8 bit
// matrices using GF2P8AFFINEQB and XORs each product into the existing
// contents of the 9 output slices.
// Processes 32 bytes/loop; runs n>>5 iterations, so a trailing n%32 bytes
// are left untouched by this routine.
//
// The matrix for (input i, output j) is the uint64 at matrix[i*9+j].
// Input data pointers are advanced from in[i]+start; outputs are addressed
// as out[j]+start and their data pointers are re-read every iteration.
//
// Register use:
//   AX              remaining loop iterations (n >> 5)
//   CX              matrix base
//   BX,SI,DI,R8-R13 data pointers of inputs 0..8
//   DX              in slice-header base, then data pointer of input 9
//   R14             out slice-header base (headers are 24 bytes apart)
//   R15             current byte offset into every output (starts at start)
//   BP              scratch: data pointer of the output being loaded/stored
//   Y0-Y4           matrices 0..4 (input 0 -> outputs 0..4), kept resident
//   Y5-Y13          accumulators for outputs 0..8
//   Y14             current 32 input bytes
//   Y15             scratch: broadcast matrix, then product

// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88
	// Loading 5 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 101 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x9Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x9Xor_loop:
	// Load 9 outputs
	MOVQ (R14), BP
	VMOVDQU (BP)(R15*1), Y5
	MOVQ 24(R14), BP
	VMOVDQU (BP)(R15*1), Y6
	MOVQ 48(R14), BP
	VMOVDQU (BP)(R15*1), Y7
	MOVQ 72(R14), BP
	VMOVDQU (BP)(R15*1), Y8
	MOVQ 96(R14), BP
	VMOVDQU (BP)(R15*1), Y9
	MOVQ 120(R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ 144(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ 168(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ 192(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	// (matrices 0..4 are resident in Y0-Y4; 5..8 are broadcast on demand)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 9 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R14), BP
	VMOVDQU Y5, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y6, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x9Xor_loop
	VZEROUPPER

mulAvxGFNI_10x9Xor_end:
	RET
|
|
|
|
// mulGFNI_10x10_64 multiplies 10 input slices by a 10x10 set of 8x8 bit
// matrices using GF2P8AFFINEQB and overwrites the 10 output slices with the
// XOR of the per-input products.
// Processes 64 bytes/loop; runs n>>6 iterations, so a trailing n%64 bytes
// are left untouched by this routine.
//
// The matrix for (input i, output j) is the uint64 at matrix[i*10+j].
// Input data pointers are advanced from in[i]+start; outputs are addressed
// as out[j]+start and their data pointers are re-read every iteration.
//
// Register use:
//   AX              remaining loop iterations (n >> 6)
//   CX              matrix base
//   BX,SI,DI,R8-R13 data pointers of inputs 0..8
//   DX              in slice-header base, then data pointer of input 9
//   R14             out slice-header base (headers are 24 bytes apart)
//   R15             current byte offset into every output (starts at start)
//   BP              scratch: data pointer of the output being stored
//   Z0-Z19          matrices 0..19 (inputs 0 and 1), kept resident
//   Z20-Z29         accumulators for outputs 0..9
//   Z30             current 64 input bytes
//   Z31             scratch product
// Matrices for inputs 2..9 are read from memory with an embedded broadcast.

// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x10_64(SB), $8-88
	// Loading 20 of 100 tables to registers
	// Destination kept on stack
	// Full registers estimated 112 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_10x10_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs
	// (first input initialises the accumulators, so no XOR is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 10 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 10 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R14), BP
	VMOVDQU64 Z20, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z21, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 216(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x10_64_loop
	VZEROUPPER

mulGFNI_10x10_64_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
|
|
// Requires: AVX, GFNI
|
|
TEXT ·mulAvxGFNI_10x10(SB), $8-88
|
|
// Loading 4 of 100 tables to registers
|
|
// Destination kept on stack
|
|
// Full registers estimated 112 YMM used
|
|
MOVQ n+80(FP), AX
|
|
MOVQ matrix_base+0(FP), CX
|
|
SHRQ $0x05, AX
|
|
TESTQ AX, AX
|
|
JZ mulAvxGFNI_10x10_end
|
|
VBROADCASTSD (CX), Y0
|
|
VBROADCASTSD 8(CX), Y1
|
|
VBROADCASTSD 16(CX), Y2
|
|
VBROADCASTSD 24(CX), Y3
|
|
MOVQ in_base+24(FP), DX
|
|
MOVQ (DX), BX
|
|
MOVQ 24(DX), SI
|
|
MOVQ 48(DX), DI
|
|
MOVQ 72(DX), R8
|
|
MOVQ 96(DX), R9
|
|
MOVQ 120(DX), R10
|
|
MOVQ 144(DX), R11
|
|
MOVQ 168(DX), R12
|
|
MOVQ 192(DX), R13
|
|
MOVQ 216(DX), DX
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ out_base+48(FP), R14
|
|
MOVQ start+72(FP), R15
|
|
|
|
// Add start offset to input
|
|
ADDQ R15, BX
|
|
ADDQ R15, SI
|
|
ADDQ R15, DI
|
|
ADDQ R15, R8
|
|
ADDQ R15, R9
|
|
ADDQ R15, R10
|
|
ADDQ R15, R11
|
|
ADDQ R15, R12
|
|
ADDQ R15, R13
|
|
ADDQ R15, DX
|
|
|
|
mulAvxGFNI_10x10_loop:
|
|
// Load and process 32 bytes from input 0 to 10 outputs
|
|
VMOVDQU (BX), Y14
|
|
ADDQ $0x20, BX
|
|
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
|
|
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
|
|
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
|
|
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
|
|
VBROADCASTSD 32(CX), Y8
|
|
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
|
|
VBROADCASTSD 40(CX), Y9
|
|
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
|
|
VBROADCASTSD 48(CX), Y10
|
|
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
|
|
VBROADCASTSD 56(CX), Y11
|
|
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
|
|
VBROADCASTSD 64(CX), Y12
|
|
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
|
|
VBROADCASTSD 72(CX), Y13
|
|
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
|
|
|
|
// Load and process 32 bytes from input 1 to 10 outputs
|
|
VMOVDQU (SI), Y14
|
|
ADDQ $0x20, SI
|
|
VBROADCASTSD 80(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 88(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 96(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 104(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 112(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 120(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 128(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 136(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 144(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 152(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 2 to 10 outputs
|
|
VMOVDQU (DI), Y14
|
|
ADDQ $0x20, DI
|
|
VBROADCASTSD 160(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 168(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 176(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 184(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 192(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 200(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 208(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 216(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 224(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 232(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 3 to 10 outputs
|
|
VMOVDQU (R8), Y14
|
|
ADDQ $0x20, R8
|
|
VBROADCASTSD 240(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 248(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 256(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 264(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 272(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 280(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 288(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 296(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 304(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 312(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 4 to 10 outputs
|
|
VMOVDQU (R9), Y14
|
|
ADDQ $0x20, R9
|
|
VBROADCASTSD 320(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 328(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 336(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 344(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 352(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 360(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 368(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 376(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 384(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 392(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 5 to 10 outputs
|
|
VMOVDQU (R10), Y14
|
|
ADDQ $0x20, R10
|
|
VBROADCASTSD 400(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 408(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 416(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 424(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 432(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 440(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 448(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 456(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 464(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 472(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 6 to 10 outputs
|
|
VMOVDQU (R11), Y14
|
|
ADDQ $0x20, R11
|
|
VBROADCASTSD 480(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 488(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 496(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 504(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 512(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 520(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 528(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 536(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 544(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 552(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 7 to 10 outputs
|
|
VMOVDQU (R12), Y14
|
|
ADDQ $0x20, R12
|
|
VBROADCASTSD 560(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 568(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 576(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 584(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 592(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 600(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 608(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 616(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 624(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 632(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 8 to 10 outputs
|
|
VMOVDQU (R13), Y14
|
|
ADDQ $0x20, R13
|
|
VBROADCASTSD 640(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 648(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 656(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 664(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 672(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 680(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 688(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 696(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 704(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 712(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Load and process 32 bytes from input 9 to 10 outputs
|
|
VMOVDQU (DX), Y14
|
|
ADDQ $0x20, DX
|
|
VBROADCASTSD 720(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y4, Y15, Y4
|
|
VBROADCASTSD 728(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y5, Y15, Y5
|
|
VBROADCASTSD 736(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y6, Y15, Y6
|
|
VBROADCASTSD 744(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y7, Y15, Y7
|
|
VBROADCASTSD 752(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y8, Y15, Y8
|
|
VBROADCASTSD 760(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y9, Y15, Y9
|
|
VBROADCASTSD 768(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y10, Y15, Y10
|
|
VBROADCASTSD 776(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y11, Y15, Y11
|
|
VBROADCASTSD 784(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y12, Y15, Y12
|
|
VBROADCASTSD 792(CX), Y15
|
|
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
|
|
VXORPD Y13, Y15, Y13
|
|
|
|
// Store 10 outputs
|
|
MOVQ (R14), BP
|
|
VMOVDQU Y4, (BP)(R15*1)
|
|
MOVQ 24(R14), BP
|
|
VMOVDQU Y5, (BP)(R15*1)
|
|
MOVQ 48(R14), BP
|
|
VMOVDQU Y6, (BP)(R15*1)
|
|
MOVQ 72(R14), BP
|
|
VMOVDQU Y7, (BP)(R15*1)
|
|
MOVQ 96(R14), BP
|
|
VMOVDQU Y8, (BP)(R15*1)
|
|
MOVQ 120(R14), BP
|
|
VMOVDQU Y9, (BP)(R15*1)
|
|
MOVQ 144(R14), BP
|
|
VMOVDQU Y10, (BP)(R15*1)
|
|
MOVQ 168(R14), BP
|
|
VMOVDQU Y11, (BP)(R15*1)
|
|
MOVQ 192(R14), BP
|
|
VMOVDQU Y12, (BP)(R15*1)
|
|
MOVQ 216(R14), BP
|
|
VMOVDQU Y13, (BP)(R15*1)
|
|
|
|
// Prepare for next loop
|
|
ADDQ $0x20, R15
|
|
DECQ AX
|
|
JNZ mulAvxGFNI_10x10_loop
|
|
VZEROUPPER
|
|
|
|
mulAvxGFNI_10x10_end:
|
|
RET
|
|
|
|
// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x10_64Xor accumulates 10 input slices into 10 output slices,
// 64 bytes per loop iteration. For every output j the existing 64 bytes are
// loaded, the VGF2P8AFFINEQB product of each input i with the 64-bit matrix
// entry matrix[i*10+j] is XORed in, and the result is written back.
// n>>6 iterations are executed; when that count is zero the function returns
// without touching any memory.
//
// Register roles:
//   AX                    remaining loop iterations
//   CX                    base of matrix
//   BX SI DI R8-R13 DX    read pointers for inputs 0..9 (start already added)
//   R14                   base of the out slice headers (24 bytes per header)
//   R15                   byte offset applied to every output, begins at start
//   BP                    scratch: data pointer of the output being accessed
//   Z0-Z19                matrix entries 0..19 (inputs 0 and 1), broadcast once
//   Z20-Z29               accumulators for outputs 0..9
//   Z30 / Z31             current input block / current product
//
// NOTE(review): BP is used as scratch; the 8-byte frame is presumably what
// makes the Go assembler save and restore it - confirm.
// NOTE: this file is generated (see the header: "go run gen.go"); the removal
// of the duplicated out_base load and of the TESTQ after SHRQ should also be
// applied in the generator.
TEXT ·mulGFNI_10x10_64Xor(SB), $8-88
	// Loading 20 of 100 tables to registers
	// Output data pointers are reloaded from the out slice headers each iteration
	// Full registers estimated 112 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from
	// its result, so no separate TESTQ is needed before the branch.
	SHRQ $0x06, AX
	JZ mulGFNI_10x10_64Xor_end

	// Broadcast the matrix entries used by inputs 0 and 1
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Fetch the data pointer of each input slice (DX is overwritten last)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x10_64Xor_loop:
	// Load 10 outputs
	MOVQ (R14), BP
	VMOVDQU64 (BP)(R15*1), Z20
	MOVQ 24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z21
	MOVQ 48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z22
	MOVQ 72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z23
	MOVQ 96(R14), BP
	VMOVDQU64 (BP)(R15*1), Z24
	MOVQ 120(R14), BP
	VMOVDQU64 (BP)(R15*1), Z25
	MOVQ 144(R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ 168(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ 192(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ 216(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	// (from here on the matrix entry is broadcast straight from memory)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 10 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 10 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 10 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R14), BP
	VMOVDQU64 Z20, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z21, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 216(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x10_64Xor_loop
	VZEROUPPER

mulGFNI_10x10_64Xor_end:
	RET
|
|
|
|
// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x10Xor accumulates 10 input slices into 10 output slices,
// 32 bytes per loop iteration. For every output j the existing 32 bytes are
// loaded, the VGF2P8AFFINEQB product of each input i with the 64-bit matrix
// entry matrix[i*10+j] is XORed in, and the result is written back.
// n>>5 iterations are executed; when that count is zero the function returns
// without touching any memory.
//
// Register roles:
//   AX                    remaining loop iterations
//   CX                    base of matrix
//   BX SI DI R8-R13 DX    read pointers for inputs 0..9 (start already added)
//   R14                   base of the out slice headers (24 bytes per header)
//   R15                   byte offset applied to every output, begins at start
//   BP                    scratch: data pointer of the output being accessed
//   Y0-Y3                 matrix entries 0..3, broadcast once
//   Y4-Y13                accumulators for outputs 0..9
//   Y14                   current input block
//   Y15                   broadcast matrix entry, then the product
//
// NOTE(review): BP is used as scratch; the 8-byte frame is presumably what
// makes the Go assembler save and restore it - confirm.
// NOTE: this file is generated (see the header: "go run gen.go"); the removal
// of the duplicated out_base load and of the TESTQ after SHRQ should also be
// applied in the generator.
TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88
	// Loading 4 of 100 tables to registers
	// Output data pointers are reloaded from the out slice headers each iteration
	// Full registers estimated 112 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from
	// its result, so no separate TESTQ is needed before the branch.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x10Xor_end

	// Broadcast the first four matrix entries (input 0, outputs 0..3)
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3

	// Fetch the data pointer of each input slice (DX is overwritten last)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x10Xor_loop:
	// Load 10 outputs
	MOVQ (R14), BP
	VMOVDQU (BP)(R15*1), Y4
	MOVQ 24(R14), BP
	VMOVDQU (BP)(R15*1), Y5
	MOVQ 48(R14), BP
	VMOVDQU (BP)(R15*1), Y6
	MOVQ 72(R14), BP
	VMOVDQU (BP)(R15*1), Y7
	MOVQ 96(R14), BP
	VMOVDQU (BP)(R15*1), Y8
	MOVQ 120(R14), BP
	VMOVDQU (BP)(R15*1), Y9
	MOVQ 144(R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ 168(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ 192(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ 216(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	// (outputs 0..3 use the preloaded Y0-Y3; the rest broadcast from memory)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 10 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 720(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 728(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 736(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 744(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 752(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 760(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 768(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 776(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 784(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 792(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R14), BP
	VMOVDQU Y4, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y5, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y6, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 216(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x10Xor_loop
	VZEROUPPER

mulAvxGFNI_10x10Xor_end:
	RET
|
|
|
|
// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
// ifftDIT48_gfni_0 updates four byte slices in place, 64 bytes per loop
// iteration, using XOR and VGF2P8AFFINEQB with the broadcast 64-bit values
// t01, t23 and t02 (written mul(x, t) below).
//
// The four data pointers are read from the work slice headers at byte
// offsets 0, dist, 2*dist and 3*dist, i.e. dist is consumed as a byte offset.
// NOTE(review): the caller presumably pre-scales dist by the 24-byte slice
// header size - confirm.
//
// With a, b, c, d the four 64-byte blocks in that order, each iteration does:
//   b ^= a;  a ^= mul(b, t01)
//   d ^= c;  c ^= mul(d, t23) ^ a
//   d ^= b
//   a ^= mul(c, t02);  b ^= mul(d, t02)
//
// The byte count is taken from offset 8 of the first slice header (its
// length field). The loop is bottom-tested, so one 64-byte block is always
// processed before the count is checked.
TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56
|
|
// Z0 = t01, Z1 = t23, Z2 = t02, each broadcast to every 64-bit lane.
VBROADCASTF32X2 t01+32(FP), Z0
|
|
VBROADCASTF32X2 t23+40(FP), Z1
|
|
VBROADCASTF32X2 t02+48(FP), Z2
|
|
MOVQ dist+24(FP), AX
|
|
MOVQ work_base+0(FP), CX
|
|
// DX = remaining byte count (length field of the first slice header).
MOVQ 8(CX), DX
|
|
// BX walks the slice headers in steps of dist bytes; SI, DI, R8, AX receive
// the four data pointers. AX (dist) is overwritten by the last load.
XORQ BX, BX
|
|
MOVQ (CX)(BX*1), SI
|
|
ADDQ AX, BX
|
|
MOVQ (CX)(BX*1), DI
|
|
ADDQ AX, BX
|
|
MOVQ (CX)(BX*1), R8
|
|
ADDQ AX, BX
|
|
MOVQ (CX)(BX*1), AX
|
|
|
|
loop:
|
|
// Z3 = a, Z4 = b, Z5 = c, Z6 = d
VMOVDQU64 (SI), Z3
|
|
VMOVDQU64 (DI), Z4
|
|
VMOVDQU64 (R8), Z5
|
|
VMOVDQU64 (AX), Z6
|
|
// b ^= a
VXORPD Z4, Z3, Z4
|
|
|
|
// LEO_MULADD_512
// a ^= mul(b, t01)
|
|
VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
|
|
VXORPD Z3, Z7, Z3
|
|
// d ^= c
VXORPD Z5, Z6, Z6
|
|
|
|
// LEO_MULADD_512
// c ^= mul(d, t23) ^ a; VPTERNLOGD with $0x96 is a three-way XOR.
|
|
VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
|
|
VPTERNLOGD $0x96, Z7, Z3, Z5
|
|
// d ^= b
VXORPD Z4, Z6, Z6
|
|
|
|
// LEO_MULADD_512
// a ^= mul(c, t02)
|
|
VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
|
|
VXORPD Z3, Z7, Z3
|
|
|
|
// LEO_MULADD_512
// b ^= mul(d, t02)
|
|
VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
|
|
VXORPD Z4, Z7, Z4
|
|
// Write the four blocks back and advance each pointer by 64 bytes.
VMOVDQU64 Z3, (SI)
|
|
ADDQ $0x40, SI
|
|
VMOVDQU64 Z4, (DI)
|
|
ADDQ $0x40, DI
|
|
VMOVDQU64 Z5, (R8)
|
|
ADDQ $0x40, R8
|
|
VMOVDQU64 Z6, (AX)
|
|
ADDQ $0x40, AX
|
|
// Continue while the remaining count is still above zero (unsigned).
SUBQ $0x40, DX
|
|
JA loop
|
|
VZEROUPPER
|
|
RET
|
|
|
|
// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
|
|
// Requires: AVX, AVX512DQ, AVX512F, GFNI
|
|
// fftDIT48_gfni_0 updates four byte slices in place, 64 bytes per loop
// iteration, using XOR and VGF2P8AFFINEQB with the broadcast 64-bit values
// t01, t23 and t02 (written mul(x, t) below).
//
// The four data pointers are read from the work slice headers at byte
// offsets 0, dist, 2*dist and 3*dist, i.e. dist is consumed as a byte offset.
// NOTE(review): the caller presumably pre-scales dist by the 24-byte slice
// header size - confirm.
//
// With a, b, c, d the four 64-byte blocks in that order, each iteration does:
//   a ^= mul(c, t02);  b ^= mul(d, t02)
//   c ^= a;  d ^= b
//   a ^= mul(b, t01);  b ^= a
//   c ^= mul(d, t23);  d ^= c
//
// The byte count is taken from offset 8 of the first slice header (its
// length field). The loop is bottom-tested, so one 64-byte block is always
// processed before the count is checked.
TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56
|
|
// Z0 = t01, Z1 = t23, Z2 = t02, each broadcast to every 64-bit lane.
VBROADCASTF32X2 t01+32(FP), Z0
|
|
VBROADCASTF32X2 t23+40(FP), Z1
|
|
VBROADCASTF32X2 t02+48(FP), Z2
|
|
MOVQ dist+24(FP), AX
|
|
MOVQ work_base+0(FP), CX
|
|
// DX = remaining byte count (length field of the first slice header).
MOVQ 8(CX), DX
|
|
// BX walks the slice headers in steps of dist bytes; SI, DI, R8, AX receive
// the four data pointers. AX (dist) is overwritten by the last load.
XORQ BX, BX
|
|
MOVQ (CX)(BX*1), SI
|
|
ADDQ AX, BX
|
|
MOVQ (CX)(BX*1), DI
|
|
ADDQ AX, BX
|
|
MOVQ (CX)(BX*1), R8
|
|
ADDQ AX, BX
|
|
MOVQ (CX)(BX*1), AX
|
|
|
|
loop:
|
|
// Z3 = a, Z4 = b, Z5 = c, Z6 = d
VMOVDQU64 (SI), Z3
|
|
VMOVDQU64 (DI), Z4
|
|
VMOVDQU64 (R8), Z5
|
|
VMOVDQU64 (AX), Z6
|
|
|
|
// LEO_MULADD_512
// a ^= mul(c, t02)
|
|
VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
|
|
VXORPD Z3, Z7, Z3
|
|
|
|
// LEO_MULADD_512
// b ^= mul(d, t02)
|
|
VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
|
|
VXORPD Z4, Z7, Z4
|
|
// c ^= a; d ^= b
VXORPD Z3, Z5, Z5
|
|
VXORPD Z4, Z6, Z6
|
|
|
|
// LEO_MULADD_512
// a ^= mul(b, t01), then b ^= a
|
|
VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
|
|
VXORPD Z3, Z7, Z3
|
|
VXORPD Z4, Z3, Z4
|
|
|
|
// LEO_MULADD_512
// c ^= mul(d, t23), then d ^= c
|
|
VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
|
|
VXORPD Z5, Z7, Z5
|
|
VXORPD Z5, Z6, Z6
|
|
// Write the four blocks back and advance each pointer by 64 bytes.
VMOVDQU64 Z3, (SI)
|
|
ADDQ $0x40, SI
|
|
VMOVDQU64 Z4, (DI)
|
|
ADDQ $0x40, DI
|
|
VMOVDQU64 Z5, (R8)
|
|
ADDQ $0x40, R8
|
|
VMOVDQU64 Z6, (AX)
|
|
ADDQ $0x40, AX
|
|
// Continue while the remaining count is still above zero (unsigned).
SUBQ $0x40, DX
|
|
JA loop
|
|
VZEROUPPER
|
|
RET
|
|
|
|
// ifftDIT48_gfni_1 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t23 and t02; t01 is part of
// the signature but is never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56
	// Broadcast the two matrices this variant uses.
	VBROADCASTF32X2 t23+40(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z2        // a
	VMOVDQU64 (DI), Z3        // b
	VMOVDQU64 (R8), Z4        // c
	VMOVDQU64 (AX), Z5        // d
	VXORPD    Z3, Z2, Z3      // b ^= a
	VXORPD    Z4, Z5, Z5      // d ^= c

	// LEO_MULADD_512: c ^= d*t23, fused with c ^= a via a three-way XOR
	VGF2P8AFFINEQB $0, Z0, Z5, Z6
	VPTERNLOGD     $0x96, Z6, Z2, Z4
	VXORPD         Z3, Z5, Z5 // d ^= b

	// LEO_MULADD_512: a ^= c*t02
	VGF2P8AFFINEQB $0, Z1, Z4, Z6
	VXORPD         Z2, Z6, Z2

	// LEO_MULADD_512: b ^= d*t02
	VGF2P8AFFINEQB $0, Z1, Z5, Z6
	VXORPD         Z3, Z6, Z3

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z2, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z3, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z4, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z5, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_1 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t01 and t23; t02 is part of
// the signature but is never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56
	// Broadcast the two matrices this variant uses.
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z2        // a
	VMOVDQU64 (DI), Z3        // b
	VMOVDQU64 (R8), Z4        // c
	VMOVDQU64 (AX), Z5        // d
	VXORPD    Z2, Z4, Z4      // c ^= a
	VXORPD    Z3, Z5, Z5      // d ^= b

	// LEO_MULADD_512: a ^= b*t01
	VGF2P8AFFINEQB $0, Z0, Z3, Z6
	VXORPD         Z2, Z6, Z2
	VXORPD         Z3, Z2, Z3 // b ^= a

	// LEO_MULADD_512: c ^= d*t23
	VGF2P8AFFINEQB $0, Z1, Z5, Z6
	VXORPD         Z4, Z6, Z4
	VXORPD         Z4, Z5, Z5 // d ^= c

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z2, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z3, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z4, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z5, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// ifftDIT48_gfni_2 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t01 and t02; t23 is part of
// the signature but is never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56
	// Broadcast the two matrices this variant uses.
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z2        // a
	VMOVDQU64 (DI), Z3        // b
	VMOVDQU64 (R8), Z4        // c
	VMOVDQU64 (AX), Z5        // d
	VXORPD    Z3, Z2, Z3      // b ^= a

	// LEO_MULADD_512: a ^= b*t01
	VGF2P8AFFINEQB $0, Z0, Z3, Z6
	VXORPD         Z2, Z6, Z2
	VXORPD         Z4, Z5, Z5 // d ^= c
	VXORPD         Z2, Z4, Z4 // c ^= a
	VXORPD         Z3, Z5, Z5 // d ^= b

	// LEO_MULADD_512: a ^= c*t02
	VGF2P8AFFINEQB $0, Z1, Z4, Z6
	VXORPD         Z2, Z6, Z2

	// LEO_MULADD_512: b ^= d*t02
	VGF2P8AFFINEQB $0, Z1, Z5, Z6
	VXORPD         Z3, Z6, Z3

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z2, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z3, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z4, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z5, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_2 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t23 and t02; t01 is part of
// the signature but is never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56
	// Broadcast the two matrices this variant uses.
	VBROADCASTF32X2 t23+40(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z2        // a
	VMOVDQU64 (DI), Z3        // b
	VMOVDQU64 (R8), Z4        // c
	VMOVDQU64 (AX), Z5        // d

	// LEO_MULADD_512: a ^= c*t02
	VGF2P8AFFINEQB $0, Z1, Z4, Z6
	VXORPD         Z2, Z6, Z2

	// LEO_MULADD_512: b ^= d*t02
	VGF2P8AFFINEQB $0, Z1, Z5, Z6
	VXORPD         Z3, Z6, Z3
	VXORPD         Z2, Z4, Z4 // c ^= a
	VXORPD         Z3, Z5, Z5 // d ^= b
	VXORPD         Z3, Z2, Z3 // b ^= a

	// LEO_MULADD_512: c ^= d*t23
	VGF2P8AFFINEQB $0, Z0, Z5, Z6
	VXORPD         Z4, Z6, Z4
	VXORPD         Z4, Z5, Z5 // d ^= c

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z2, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z3, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z4, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z5, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// ifftDIT48_gfni_3 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t02; t01 and t23 are part
// of the signature but are never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56
	// Broadcast the single matrix this variant uses.
	VBROADCASTF32X2 t02+48(FP), Z0

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z1        // a
	VMOVDQU64 (DI), Z2        // b
	VMOVDQU64 (R8), Z3        // c
	VMOVDQU64 (AX), Z4        // d
	VXORPD    Z2, Z1, Z2      // b ^= a
	VXORPD    Z3, Z4, Z4      // d ^= c
	VXORPD    Z1, Z3, Z3      // c ^= a
	VXORPD    Z2, Z4, Z4      // d ^= b

	// LEO_MULADD_512: a ^= c*t02
	VGF2P8AFFINEQB $0, Z0, Z3, Z5
	VXORPD         Z1, Z5, Z1

	// LEO_MULADD_512: b ^= d*t02
	VGF2P8AFFINEQB $0, Z0, Z4, Z5
	VXORPD         Z2, Z5, Z2

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z1, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z2, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z3, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z4, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_3 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t23; t01 and t02 are part
// of the signature but are never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56
	// Broadcast the single matrix this variant uses.
	VBROADCASTF32X2 t23+40(FP), Z0

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z1        // a
	VMOVDQU64 (DI), Z2        // b
	VMOVDQU64 (R8), Z3        // c
	VMOVDQU64 (AX), Z4        // d
	VXORPD    Z1, Z3, Z3      // c ^= a
	VXORPD    Z2, Z4, Z4      // d ^= b
	VXORPD    Z2, Z1, Z2      // b ^= a

	// LEO_MULADD_512: c ^= d*t23
	VGF2P8AFFINEQB $0, Z0, Z4, Z5
	VXORPD         Z3, Z5, Z3
	VXORPD         Z3, Z4, Z4 // d ^= c

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z1, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z2, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z3, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z4, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// ifftDIT48_gfni_4 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t01 and t23; t02 is part of
// the signature but is never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56
	// Broadcast the two matrices this variant uses.
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z2        // a
	VMOVDQU64 (DI), Z3        // b
	VMOVDQU64 (R8), Z4        // c
	VMOVDQU64 (AX), Z5        // d
	VXORPD    Z3, Z2, Z3      // b ^= a

	// LEO_MULADD_512: a ^= b*t01
	VGF2P8AFFINEQB $0, Z0, Z3, Z6
	VXORPD         Z2, Z6, Z2
	VXORPD         Z4, Z5, Z5 // d ^= c

	// LEO_MULADD_512: c ^= d*t23, fused with c ^= a via a three-way XOR
	VGF2P8AFFINEQB $0, Z1, Z5, Z6
	VPTERNLOGD     $0x96, Z6, Z2, Z4
	VXORPD         Z3, Z5, Z5 // d ^= b

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z2, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z3, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z4, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z5, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_4 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t01 and t02; t23 is part of
// the signature but is never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56
	// Broadcast the two matrices this variant uses.
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z2        // a
	VMOVDQU64 (DI), Z3        // b
	VMOVDQU64 (R8), Z4        // c
	VMOVDQU64 (AX), Z5        // d

	// LEO_MULADD_512: a ^= c*t02
	VGF2P8AFFINEQB $0, Z1, Z4, Z6
	VXORPD         Z2, Z6, Z2

	// LEO_MULADD_512: b ^= d*t02
	VGF2P8AFFINEQB $0, Z1, Z5, Z6
	VXORPD         Z3, Z6, Z3
	VXORPD         Z2, Z4, Z4 // c ^= a
	VXORPD         Z3, Z5, Z5 // d ^= b

	// LEO_MULADD_512: a ^= b*t01
	VGF2P8AFFINEQB $0, Z0, Z3, Z6
	VXORPD         Z2, Z6, Z2
	VXORPD         Z3, Z2, Z3 // b ^= a
	VXORPD         Z4, Z5, Z5 // d ^= c

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z2, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z3, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z4, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z5, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// ifftDIT48_gfni_5 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t23; t01 and t02 are part
// of the signature but are never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56
	// Broadcast the single matrix this variant uses.
	VBROADCASTF32X2 t23+40(FP), Z0

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z1        // a
	VMOVDQU64 (DI), Z2        // b
	VMOVDQU64 (R8), Z3        // c
	VMOVDQU64 (AX), Z4        // d
	VXORPD    Z2, Z1, Z2      // b ^= a
	VXORPD    Z3, Z4, Z4      // d ^= c

	// LEO_MULADD_512: c ^= d*t23, fused with c ^= a via a three-way XOR
	VGF2P8AFFINEQB $0, Z0, Z4, Z5
	VPTERNLOGD     $0x96, Z5, Z1, Z3
	VXORPD         Z2, Z4, Z4 // d ^= b

	// Write the results back and step every pointer by one block.
	// Buffer 0 is stored unchanged: nothing above writes Z1.
	VMOVDQU64 Z1, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z2, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z3, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z4, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_5 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t01; t23 and t02 are part
// of the signature but are never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56
	// Broadcast the single matrix this variant uses.
	VBROADCASTF32X2 t01+32(FP), Z0

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z1        // a
	VMOVDQU64 (DI), Z2        // b
	VMOVDQU64 (R8), Z3        // c
	VMOVDQU64 (AX), Z4        // d
	VXORPD    Z1, Z3, Z3      // c ^= a
	VXORPD    Z2, Z4, Z4      // d ^= b

	// LEO_MULADD_512: a ^= b*t01
	VGF2P8AFFINEQB $0, Z0, Z2, Z5
	VXORPD         Z1, Z5, Z1
	VXORPD         Z2, Z1, Z2 // b ^= a
	VXORPD         Z3, Z4, Z4 // d ^= c

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z1, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z2, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z3, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z4, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// ifftDIT48_gfni_6 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t01; t23 and t02 are part
// of the signature but are never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56
	// Broadcast the single matrix this variant uses.
	VBROADCASTF32X2 t01+32(FP), Z0

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z1        // a
	VMOVDQU64 (DI), Z2        // b
	VMOVDQU64 (R8), Z3        // c
	VMOVDQU64 (AX), Z4        // d
	VXORPD    Z2, Z1, Z2      // b ^= a

	// LEO_MULADD_512: a ^= b*t01
	VGF2P8AFFINEQB $0, Z0, Z2, Z5
	VXORPD         Z1, Z5, Z1
	VXORPD         Z3, Z4, Z4 // d ^= c
	VXORPD         Z1, Z3, Z3 // c ^= a
	VXORPD         Z2, Z4, Z4 // d ^= b

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z1, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z2, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z3, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z4, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_6 runs a four-buffer XOR/multiply butterfly in place,
// 64 bytes per iteration. This variant reads only t02; t01 and t23 are part
// of the signature but are never loaded.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3 and x*t is VGF2P8AFFINEQB applied to x with the broadcast matrix t.
//
// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56
	// Broadcast the single matrix this variant uses.
	VBROADCASTF32X2 t02+48(FP), Z0

	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z1        // a
	VMOVDQU64 (DI), Z2        // b
	VMOVDQU64 (R8), Z3        // c
	VMOVDQU64 (AX), Z4        // d

	// LEO_MULADD_512: a ^= c*t02
	VGF2P8AFFINEQB $0, Z0, Z3, Z5
	VXORPD         Z1, Z5, Z1

	// LEO_MULADD_512: b ^= d*t02
	VGF2P8AFFINEQB $0, Z0, Z4, Z5
	VXORPD         Z2, Z5, Z2
	VXORPD         Z1, Z3, Z3 // c ^= a
	VXORPD         Z2, Z4, Z4 // d ^= b
	VXORPD         Z2, Z1, Z2 // b ^= a
	VXORPD         Z3, Z4, Z4 // d ^= c

	// Write the results back and step every pointer by one block.
	VMOVDQU64 Z1, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z2, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z3, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z4, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// ifftDIT48_gfni_7 runs a four-buffer XOR-only butterfly in place, 64 bytes
// per iteration. None of t01, t23 or t02 is loaded; they are present only in
// the signature.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3.
//
// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F
TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56
	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z0        // a
	VMOVDQU64 (DI), Z1        // b
	VMOVDQU64 (R8), Z2        // c
	VMOVDQU64 (AX), Z3        // d
	VXORPD    Z1, Z0, Z1      // b ^= a
	VXORPD    Z2, Z3, Z3      // d ^= c
	VXORPD    Z0, Z2, Z2      // c ^= a
	VXORPD    Z1, Z3, Z3      // d ^= b

	// Write the results back and step every pointer by one block.
	// Buffer 0 is stored unchanged: nothing above writes Z0.
	VMOVDQU64 Z0, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z1, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z2, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z3, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|
|
|
|
// fftDIT48_gfni_7 runs a four-buffer XOR-only butterfly in place, 64 bytes
// per iteration. None of t01, t23 or t02 is loaded; they are present only in
// the signature.
//
// The four slice headers are read at byte offsets 0, dist, 2*dist and 3*dist
// from work's base, so dist is used as a raw byte offset (presumably
// pre-scaled by the caller; confirm at the call site). The byte count is the
// length word of the first header only. The loop body executes before the
// remaining length is tested, so at least one 64-byte block is processed.
//
// In the comments below a, b, c, d are the current 64-byte blocks of buffers
// 0..3.
//
// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F
TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56
	// Resolve the four data pointers.
	MOVQ dist+24(FP), AX      // AX = byte stride between slice headers
	MOVQ work_base+0(FP), CX  // CX = address of the first slice header
	MOVQ 8(CX), DX            // DX = length word of the first header (bytes left)
	XORQ BX, BX               // BX = running header offset
	MOVQ (CX)(BX*1), SI       // SI -> buffer 0
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI       // DI -> buffer 1
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8       // R8 -> buffer 2
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX       // AX -> buffer 3 (stride is no longer needed)

next_block:
	VMOVDQU64 (SI), Z0        // a
	VMOVDQU64 (DI), Z1        // b
	VMOVDQU64 (R8), Z2        // c
	VMOVDQU64 (AX), Z3        // d
	VXORPD    Z0, Z2, Z2      // c ^= a
	VXORPD    Z1, Z3, Z3      // d ^= b
	VXORPD    Z1, Z0, Z1      // b ^= a
	VXORPD    Z2, Z3, Z3      // d ^= c

	// Write the results back and step every pointer by one block.
	// Buffer 0 is stored unchanged: nothing above writes Z0.
	VMOVDQU64 Z0, (SI)
	ADDQ      $64, SI
	VMOVDQU64 Z1, (DI)
	ADDQ      $64, DI
	VMOVDQU64 Z2, (R8)
	ADDQ      $64, R8
	VMOVDQU64 Z3, (AX)
	ADDQ      $64, AX

	// Continue while the remaining byte count is still above zero (unsigned).
	SUBQ $64, DX
	JA   next_block
	VZEROUPPER
	RET
|