status-go/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s
2024-04-17 22:05:53 +02:00

67988 lines
1.8 MiB

// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT.
//go:build !appengine && !noasm && !nogen && nopshufb && gc
#include "textflag.h"
// func _dummy_()
// _dummy_ performs no work (its only instruction is RET); the TEXT block
// serves as the place where the XOR3WAY preprocessor macro is defined.
TEXT ·_dummy_(SB), $0
// XOR3WAY(ignore, a, b, dst) XORs a and b into dst. The first argument is
// not referenced by either expansion.
// When GOAMD64_v4 is defined a single VPTERNLOGD with truth table 0x96
// (three-input XOR) is emitted.
#ifdef GOAMD64_v4
#define XOR3WAY(ignore, a, b, dst) \
VPTERNLOGD $0x96, a, b, dst
// Otherwise the same result is produced with two VPXOR instructions.
#else
#define XOR3WAY(ignore, a, b, dst) \
VPXOR a, dst, dst \
VPXOR b, dst, dst
#endif
RET
// sSE2XorSlice XORs the bytes of in into out, leaving the result in out.
// One 16-byte vector is handled per iteration; len(in)>>4 iterations run,
// so a trailing partial vector is not touched.
// func sSE2XorSlice(in []byte, out []byte)
// Requires: SSE2
TEXT ·sSE2XorSlice(SB), $0-48
	MOVQ in_len+8(FP), DX
	MOVQ in_base+0(FP), AX
	MOVQ out_base+24(FP), CX

	// DX = number of 16-byte vectors; leave right away if there are none
	SHRQ $0x04, DX
	JZ   xor16_done

xor16_loop:
	// out[i:i+16] ^= in[i:i+16]
	MOVOU (CX), X1
	MOVOU (AX), X0
	ADDQ  $0x10, AX
	PXOR  X0, X1
	MOVOU X1, (CX)
	ADDQ  $0x10, CX

	// Next vector
	DECQ DX
	JNZ  xor16_loop

xor16_done:
	RET
// sSE2XorSlice_64 XORs the bytes of in into out, leaving the result in out.
// Four 16-byte vectors (64 bytes) are handled per iteration; len(in)>>6
// iterations run, so a trailing partial block is not touched.
// func sSE2XorSlice_64(in []byte, out []byte)
// Requires: SSE2
TEXT ·sSE2XorSlice_64(SB), $0-48
	MOVQ in_len+8(FP), DX
	MOVQ in_base+0(FP), AX
	MOVQ out_base+24(FP), CX

	// DX = number of 64-byte blocks; leave right away if there are none
	SHRQ $0x06, DX
	JZ   xor64_done

xor64_loop:
	// Current contents of the destination block
	MOVOU (CX), X1
	MOVOU 16(CX), X3
	MOVOU 32(CX), X5
	MOVOU 48(CX), X7

	// Matching source block
	MOVOU (AX), X0
	MOVOU 16(AX), X2
	MOVOU 32(AX), X4
	MOVOU 48(AX), X6
	ADDQ  $0x40, AX

	// Combine and write back, one 16-byte lane at a time
	PXOR  X0, X1
	MOVOU X1, (CX)
	PXOR  X2, X3
	MOVOU X3, 16(CX)
	PXOR  X4, X5
	MOVOU X5, 32(CX)
	PXOR  X6, X7
	MOVOU X7, 48(CX)
	ADDQ  $0x40, CX

	// Next block
	DECQ DX
	JNZ  xor64_loop

xor64_done:
	RET
// avx2XorSlice_64 XORs the bytes of in into out, leaving the result in out.
// Two 32-byte vectors (64 bytes) are handled per iteration; len(in)>>6
// iterations run, so a trailing partial block is not touched.
// func avx2XorSlice_64(in []byte, out []byte)
// Requires: AVX, AVX2
TEXT ·avx2XorSlice_64(SB), $0-48
	MOVQ in_len+8(FP), DX
	MOVQ in_base+0(FP), AX
	MOVQ out_base+24(FP), CX

	// DX = number of 64-byte blocks; skip the loop if there are none
	SHRQ $0x06, DX
	JZ   avx2xor64_done

avx2xor64_loop:
	// Current contents of the destination block
	VMOVDQU (CX), Y1
	VMOVDQU 32(CX), Y3

	// Matching source block
	VMOVDQU (AX), Y0
	VMOVDQU 32(AX), Y2
	ADDQ    $0x40, AX

	// Combine and write back
	VPXOR   Y0, Y1, Y1
	VMOVDQU Y1, (CX)
	VPXOR   Y2, Y3, Y3
	VMOVDQU Y3, 32(CX)
	ADDQ    $0x40, CX

	// Next block
	DECQ DX
	JNZ  avx2xor64_loop

avx2xor64_done:
	// Executed on both paths, including the zero-length one
	VZEROUPPER
	RET
// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Transforms in[0] with the 8-byte GF(2) affine matrix matrix[0]
// (VGF2P8AFFINEQB, constant 0) and overwrites out[0] with the result.
// Processing begins start bytes into both buffers and covers n>>6 blocks of
// 64 bytes; the function returns immediately when that block count is zero.
TEXT ·mulGFNI_1x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 4 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x1_64_end
	VBROADCASTF32X2 (CX), Z0 // Z0 = matrix[0] repeated in every 64-bit lane
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX // DX = data pointer of out[0]
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulGFNI_1x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (CX), Z1
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z1, Z1

	// Store 1 outputs
	VMOVDQU64 Z1, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x1_64_loop
	VZEROUPPER

mulGFNI_1x1_64_end:
	RET
// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Transforms in[0] with the 8-byte GF(2) affine matrix matrix[0]
// (VGF2P8AFFINEQB, constant 0) and overwrites out[0] with the result.
// Processing begins start bytes into both buffers and covers n>>5 blocks of
// 32 bytes; the function returns immediately when that block count is zero.
TEXT ·mulAvxGFNI_1x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 4 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x1_end
	VBROADCASTSD (CX), Y0 // Y0 = matrix[0] repeated in every 64-bit lane
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX // DX = data pointer of out[0]
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulAvxGFNI_1x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (CX), Y1
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y1, Y1

	// Store 1 outputs
	VMOVDQU Y1, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x1_loop
	VZEROUPPER

mulAvxGFNI_1x1_end:
	RET
// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Transforms in[0] with the 8-byte GF(2) affine matrix matrix[0]
// (VGF2P8AFFINEQB, constant 0) and XORs the result into the existing
// contents of out[0]. Processing begins start bytes into both buffers and
// covers n>>6 blocks of 64 bytes; the function returns immediately when that
// block count is zero.
TEXT ·mulGFNI_1x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 4 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0 // Z0 = matrix[0] repeated in every 64-bit lane
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX // DX = data pointer of out[0]
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulGFNI_1x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (DX), Z1

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (CX), Z2
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z2, Z2
	VXORPD Z1, Z2, Z1

	// Store 1 outputs
	VMOVDQU64 Z1, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x1_64Xor_loop
	VZEROUPPER

mulGFNI_1x1_64Xor_end:
	RET
// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Transforms in[0] with the 8-byte GF(2) affine matrix matrix[0]
// (VGF2P8AFFINEQB, constant 0) and XORs the result into the existing
// contents of out[0]. Processing begins start bytes into both buffers and
// covers n>>5 blocks of 32 bytes; the function returns immediately when that
// block count is zero.
TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 4 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x1Xor_end
	VBROADCASTSD (CX), Y0 // Y0 = matrix[0] repeated in every 64-bit lane
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX
	MOVQ (DX), DX // DX = data pointer of out[0]
	MOVQ start+72(FP), BX

	// Add start offset to output
	ADDQ BX, DX

	// Add start offset to input
	ADDQ BX, CX

mulAvxGFNI_1x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (DX), Y1

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (CX), Y2
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y2, Y2
	VXORPD Y1, Y2, Y1

	// Store 1 outputs
	VMOVDQU Y1, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x1Xor_loop
	VZEROUPPER

mulAvxGFNI_1x1Xor_end:
	RET
// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..1, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>6
// blocks of 64 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulGFNI_1x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulGFNI_1x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (CX), Z3
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
	VGF2P8AFFINEQB $0x00, Z1, Z3, Z3

	// Store 2 outputs
	VMOVDQU64 Z2, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z3, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x2_64_loop
	VZEROUPPER

mulGFNI_1x2_64_end:
	RET
// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..1, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>5
// blocks of 32 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulAvxGFNI_1x2(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulAvxGFNI_1x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (CX), Y3
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
	VGF2P8AFFINEQB $0x00, Y1, Y3, Y3

	// Store 2 outputs
	VMOVDQU Y2, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y3, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x2_loop
	VZEROUPPER

mulAvxGFNI_1x2_end:
	RET
// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..1, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>6 blocks of 64 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulGFNI_1x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulGFNI_1x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (BX), Z2
	VMOVDQU64 (DX), Z3

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (CX), Z4
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z2, Z5, Z2
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z5
	VXORPD Z3, Z5, Z3

	// Store 2 outputs
	VMOVDQU64 Z2, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z3, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x2_64Xor_loop
	VZEROUPPER

mulGFNI_1x2_64Xor_end:
	RET
// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..1, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>5 blocks of 32 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 6 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), DX
	MOVQ start+72(FP), SI

	// Add start offset to output
	ADDQ SI, BX
	ADDQ SI, DX

	// Add start offset to input
	ADDQ SI, CX

mulAvxGFNI_1x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (BX), Y2
	VMOVDQU (DX), Y3

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (CX), Y4
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y4, Y5
	VXORPD Y2, Y5, Y2
	VGF2P8AFFINEQB $0x00, Y1, Y4, Y5
	VXORPD Y3, Y5, Y3

	// Store 2 outputs
	VMOVDQU Y2, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y3, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x2Xor_loop
	VZEROUPPER

mulAvxGFNI_1x2Xor_end:
	RET
// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..2, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>6
// blocks of 64 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulGFNI_1x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulGFNI_1x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (CX), Z5
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z3
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z4
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z5

	// Store 3 outputs
	VMOVDQU64 Z3, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z4, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z5, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x3_64_loop
	VZEROUPPER

mulGFNI_1x3_64_end:
	RET
// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..2, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>5
// blocks of 32 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulAvxGFNI_1x3(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulAvxGFNI_1x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (CX), Y5
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y5, Y3
	VGF2P8AFFINEQB $0x00, Y1, Y5, Y4
	VGF2P8AFFINEQB $0x00, Y2, Y5, Y5

	// Store 3 outputs
	VMOVDQU Y3, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y4, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y5, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x3_loop
	VZEROUPPER

mulAvxGFNI_1x3_end:
	RET
// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..2, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>6 blocks of 64 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulGFNI_1x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulGFNI_1x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (BX), Z3
	VMOVDQU64 (SI), Z4
	VMOVDQU64 (DX), Z5

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (CX), Z6
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
	VXORPD Z3, Z7, Z3
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VXORPD Z4, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD Z5, Z7, Z5

	// Store 3 outputs
	VMOVDQU64 Z3, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z4, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z5, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x3_64Xor_loop
	VZEROUPPER

mulGFNI_1x3_64Xor_end:
	RET
// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..2, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>5 blocks of 32 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX
	MOVQ start+72(FP), DI

	// Add start offset to output
	ADDQ DI, BX
	ADDQ DI, SI
	ADDQ DI, DX

	// Add start offset to input
	ADDQ DI, CX

mulAvxGFNI_1x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (BX), Y3
	VMOVDQU (SI), Y4
	VMOVDQU (DX), Y5

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (CX), Y6
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
	VXORPD Y3, Y7, Y3
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
	VXORPD Y4, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
	VXORPD Y5, Y7, Y5

	// Store 3 outputs
	VMOVDQU Y3, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y4, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y5, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x3Xor_loop
	VZEROUPPER

mulAvxGFNI_1x3Xor_end:
	RET
// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..3, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>6
// blocks of 64 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulGFNI_1x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulGFNI_1x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (CX), Z7
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z7, Z4
	VGF2P8AFFINEQB $0x00, Z1, Z7, Z5
	VGF2P8AFFINEQB $0x00, Z2, Z7, Z6
	VGF2P8AFFINEQB $0x00, Z3, Z7, Z7

	// Store 4 outputs
	VMOVDQU64 Z4, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z5, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z6, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z7, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x4_64_loop
	VZEROUPPER

mulGFNI_1x4_64_end:
	RET
// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..3, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>5
// blocks of 32 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulAvxGFNI_1x4(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulAvxGFNI_1x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (CX), Y7
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y7, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y7, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y7, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y7, Y7

	// Store 4 outputs
	VMOVDQU Y4, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y5, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x4_loop
	VZEROUPPER

mulAvxGFNI_1x4_end:
	RET
// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..3, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>6 blocks of 64 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulGFNI_1x4_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulGFNI_1x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (BX), Z4
	VMOVDQU64 (SI), Z5
	VMOVDQU64 (DI), Z6
	VMOVDQU64 (DX), Z7

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (CX), Z8
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
	VXORPD Z4, Z9, Z4
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
	VXORPD Z5, Z9, Z5
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
	VXORPD Z6, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
	VXORPD Z7, Z9, Z7

	// Store 4 outputs
	VMOVDQU64 Z4, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z5, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z6, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z7, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x4_64Xor_loop
	VZEROUPPER

mulGFNI_1x4_64Xor_end:
	RET
// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..3, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>5 blocks of 32 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ start+72(FP), R8

	// Add start offset to output
	ADDQ R8, BX
	ADDQ R8, SI
	ADDQ R8, DI
	ADDQ R8, DX

	// Add start offset to input
	ADDQ R8, CX

mulAvxGFNI_1x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (BX), Y4
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU (DX), Y7

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (CX), Y8
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
	VXORPD Y4, Y9, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
	VXORPD Y5, Y9, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
	VXORPD Y6, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
	VXORPD Y7, Y9, Y7

	// Store 4 outputs
	VMOVDQU Y4, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y5, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x4Xor_loop
	VZEROUPPER

mulAvxGFNI_1x4Xor_end:
	RET
// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..4, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>6
// blocks of 64 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulGFNI_1x5_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x5_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulGFNI_1x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (CX), Z9
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z9, Z5
	VGF2P8AFFINEQB $0x00, Z1, Z9, Z6
	VGF2P8AFFINEQB $0x00, Z2, Z9, Z7
	VGF2P8AFFINEQB $0x00, Z3, Z9, Z8
	VGF2P8AFFINEQB $0x00, Z4, Z9, Z9

	// Store 5 outputs
	VMOVDQU64 Z5, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z6, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z7, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z8, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z9, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x5_64_loop
	VZEROUPPER

mulGFNI_1x5_64_end:
	RET
// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..4, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>5
// blocks of 32 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulAvxGFNI_1x5(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x5_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulAvxGFNI_1x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (CX), Y9
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y9, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y9, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y9, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y9, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y9, Y9

	// Store 5 outputs
	VMOVDQU Y5, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y6, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x5_loop
	VZEROUPPER

mulAvxGFNI_1x5_end:
	RET
// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..4, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>6 blocks of 64 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulGFNI_1x5_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x5_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulGFNI_1x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (BX), Z5
	VMOVDQU64 (SI), Z6
	VMOVDQU64 (DI), Z7
	VMOVDQU64 (R8), Z8
	VMOVDQU64 (DX), Z9

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (CX), Z10
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
	VXORPD Z5, Z11, Z5
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
	VXORPD Z6, Z11, Z6
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
	VXORPD Z7, Z11, Z7
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
	VXORPD Z8, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
	VXORPD Z9, Z11, Z9

	// Store 5 outputs
	VMOVDQU64 Z5, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z6, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z7, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z8, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z9, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x5_64Xor_loop
	VZEROUPPER

mulGFNI_1x5_64Xor_end:
	RET
// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..4, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>5 blocks of 32 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x5Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, DX

	// Add start offset to input
	ADDQ R9, CX

mulAvxGFNI_1x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (BX), Y5
	VMOVDQU (SI), Y6
	VMOVDQU (DI), Y7
	VMOVDQU (R8), Y8
	VMOVDQU (DX), Y9

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (CX), Y10
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
	VXORPD Y5, Y11, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
	VXORPD Y6, Y11, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
	VXORPD Y7, Y11, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
	VXORPD Y8, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
	VXORPD Y9, Y11, Y9

	// Store 5 outputs
	VMOVDQU Y5, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y6, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x5Xor_loop
	VZEROUPPER

mulAvxGFNI_1x5Xor_end:
	RET
// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..5, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>6
// blocks of 64 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulGFNI_1x6_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x6_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulGFNI_1x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (CX), Z11
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z11, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z11, Z7
	VGF2P8AFFINEQB $0x00, Z2, Z11, Z8
	VGF2P8AFFINEQB $0x00, Z3, Z11, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z11, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z11, Z11

	// Store 6 outputs
	VMOVDQU64 Z6, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z7, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z8, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z9, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z10, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z11, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x6_64_loop
	VZEROUPPER

mulGFNI_1x6_64_end:
	RET
// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..5, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and overwrites out[i] with the
// result. Processing begins start bytes into every buffer and covers n>>5
// blocks of 32 bytes; the function returns immediately when that block count
// is zero.
TEXT ·mulAvxGFNI_1x6(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x6_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulAvxGFNI_1x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (CX), Y11
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y11, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y11, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y11, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y11, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y11, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y11, Y11

	// Store 6 outputs
	VMOVDQU Y6, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y7, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y8, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y9, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y11, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x6_loop
	VZEROUPPER

mulAvxGFNI_1x6_end:
	RET
// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For i = 0..5, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>6 blocks of 64 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulGFNI_1x6_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX // AX = 64-byte block count; SHRQ sets ZF when it is zero
	JZ   mulGFNI_1x6_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulGFNI_1x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (BX), Z6
	VMOVDQU64 (SI), Z7
	VMOVDQU64 (DI), Z8
	VMOVDQU64 (R8), Z9
	VMOVDQU64 (R9), Z10
	VMOVDQU64 (DX), Z11

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (CX), Z12
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
	VXORPD Z6, Z13, Z6
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
	VXORPD Z7, Z13, Z7
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD Z8, Z13, Z8
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Store 6 outputs
	VMOVDQU64 Z6, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z7, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z8, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z9, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z10, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z11, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_1x6_64Xor_loop
	VZEROUPPER

mulGFNI_1x6_64Xor_end:
	RET
// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For i = 0..5, transforms in[0] with the 8-byte GF(2) affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant 0) and XORs the result into the
// existing contents of out[i]. Processing begins start bytes into every
// buffer and covers n>>5 blocks of 32 bytes; the function returns
// immediately when that block count is zero.
TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX // AX = 32-byte block count; SHRQ sets ZF when it is zero
	JZ   mulAvxGFNI_1x6Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX // CX = data pointer of in[0]
	MOVQ out_base+48(FP), DX

	// Output data pointers; consecutive slice headers are 24 bytes apart
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, DX

	// Add start offset to input
	ADDQ R10, CX

mulAvxGFNI_1x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (BX), Y6
	VMOVDQU (SI), Y7
	VMOVDQU (DI), Y8
	VMOVDQU (R8), Y9
	VMOVDQU (R9), Y10
	VMOVDQU (DX), Y11

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (CX), Y12
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
	VXORPD Y6, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
	VXORPD Y7, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD Y8, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Store 6 outputs
	VMOVDQU Y6, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y7, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y8, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y9, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y11, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_1x6Xor_loop
	VZEROUPPER

mulAvxGFNI_1x6Xor_end:
	RET
// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting kernel: 1 input slice, 7 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..6
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                           remaining block count
//   CX                           matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, DX  out[0..6] cursors
//   R11                          start offset
//   Z0-Z6                        matrix[0..6], each broadcast to every 64-bit lane
//   Z7-Z13                       results; Z13 also holds the input block
TEXT ·mulGFNI_1x7_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x7_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulGFNI_1x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (CX), Z13
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z13, Z7
	VGF2P8AFFINEQB $0x00, Z1, Z13, Z8
	VGF2P8AFFINEQB $0x00, Z2, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z3, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z4, Z13, Z11
	VGF2P8AFFINEQB $0x00, Z5, Z13, Z12
	// Last use of the input: its register is reused for the final result.
	VGF2P8AFFINEQB $0x00, Z6, Z13, Z13

	// Store 7 outputs
	VMOVDQU64 Z7, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z8, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z9, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z11, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z12, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z13, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x7_64_loop
	VZEROUPPER

mulGFNI_1x7_64_end:
	RET
// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting kernel: 1 input slice, 7 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..6
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                           remaining block count
//   CX                           matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, DX  out[0..6] cursors
//   R11                          start offset
//   Y0-Y6                        matrix[0..6], each broadcast to every 64-bit lane
//   Y7-Y13                       results; Y13 also holds the input block
TEXT ·mulAvxGFNI_1x7(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulAvxGFNI_1x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (CX), Y13
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y13, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y13, Y12
	// Last use of the input: its register is reused for the final result.
	VGF2P8AFFINEQB $0x00, Y6, Y13, Y13

	// Store 7 outputs
	VMOVDQU Y7, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y8, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y9, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x7_loop
	VZEROUPPER

mulAvxGFNI_1x7_end:
	RET
// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating kernel: 1 input slice, 7 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..6
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                           remaining block count
//   CX                           matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, DX  out[0..6] cursors
//   R11                          start offset
//   Z0-Z6                        matrix[0..6], each broadcast to every 64-bit lane
//   Z7-Z13                       output accumulators; Z14 input block; Z15 scratch
TEXT ·mulGFNI_1x7_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulGFNI_1x7_64Xor_loop:
	// Load 7 outputs
	VMOVDQU64 (BX), Z7
	VMOVDQU64 (SI), Z8
	VMOVDQU64 (DI), Z9
	VMOVDQU64 (R8), Z10
	VMOVDQU64 (R9), Z11
	VMOVDQU64 (R10), Z12
	VMOVDQU64 (DX), Z13

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (CX), Z14
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
	VXORPD Z7, Z15, Z7
	VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
	VXORPD Z8, Z15, Z8
	VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
	VXORPD Z9, Z15, Z9
	VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
	VXORPD Z10, Z15, Z10
	VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
	VXORPD Z11, Z15, Z11
	VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
	VXORPD Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
	VXORPD Z13, Z15, Z13

	// Store 7 outputs
	VMOVDQU64 Z7, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z8, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z9, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z11, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z12, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z13, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x7_64Xor_loop
	VZEROUPPER

mulGFNI_1x7_64Xor_end:
	RET
// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating kernel: 1 input slice, 7 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..6
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                           remaining block count
//   CX                           matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, DX  out[0..6] cursors
//   R11                          start offset
//   Y0-Y6                        matrix[0..6], each broadcast to every 64-bit lane
//   Y7-Y13                       output accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DX

	// Add start offset to input
	ADDQ R11, CX

mulAvxGFNI_1x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (BX), Y7
	VMOVDQU (SI), Y8
	VMOVDQU (DI), Y9
	VMOVDQU (R8), Y10
	VMOVDQU (R9), Y11
	VMOVDQU (R10), Y12
	VMOVDQU (DX), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (CX), Y14
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (BX)
	ADDQ $0x20, BX
	VMOVDQU Y8, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y9, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (DX)
	ADDQ $0x20, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x7Xor_loop
	VZEROUPPER

mulAvxGFNI_1x7Xor_end:
	RET
// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting kernel: 1 input slice, 8 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..7
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                                remaining block count
//   CX                                matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, R11, DX  out[0..7] cursors
//   R12                               start offset
//   Z0-Z7                             matrix[0..7], each broadcast to every 64-bit lane
//   Z8-Z15                            results; Z15 also holds the input block
TEXT ·mulGFNI_1x8_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DX

	// Add start offset to input
	ADDQ R12, CX

mulGFNI_1x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (CX), Z15
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z15, Z8
	VGF2P8AFFINEQB $0x00, Z1, Z15, Z9
	VGF2P8AFFINEQB $0x00, Z2, Z15, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z15, Z11
	VGF2P8AFFINEQB $0x00, Z4, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z15, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z15, Z14
	// Last use of the input: its register is reused for the final result.
	VGF2P8AFFINEQB $0x00, Z7, Z15, Z15

	// Store 8 outputs
	VMOVDQU64 Z8, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z9, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z10, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z11, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z12, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z13, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z14, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z15, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x8_64_loop
	VZEROUPPER

mulGFNI_1x8_64_end:
	RET
// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting kernel: 1 input slice, 8 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..7
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// All eight matrices are broadcast once before the loop. Because the input
// block shares Y15 with the last result, 8 tables + 8 results fit exactly in
// Y0-Y15 and no matrix has to be re-broadcast from memory inside the loop.
//
// Register use:
//   AX                                 remaining block count
//   CX                                 matrix base (setup only)
//   DX                                 in[0] cursor
//   SI, DI, R8, R9, R10, R11, R12, BX  out[0..7] cursors
//   R13                                start offset
//   Y0-Y7                              matrix[0..7], each broadcast to every 64-bit lane
//   Y8-Y15                             results; Y15 also holds the input block
TEXT ·mulAvxGFNI_1x8(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), BX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, BX

	// Add start offset to input
	ADDQ R13, DX

mulAvxGFNI_1x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y15
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y15, Y13
	VGF2P8AFFINEQB $0x00, Y6, Y15, Y14
	// Last use of the input: its register is reused for the final result.
	VGF2P8AFFINEQB $0x00, Y7, Y15, Y15

	// Store 8 outputs
	VMOVDQU Y8, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y9, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y14, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y15, (BX)
	ADDQ $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x8_loop
	VZEROUPPER

mulAvxGFNI_1x8_end:
	RET
// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating kernel: 1 input slice, 8 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..7
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                                remaining block count
//   CX                                matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, R11, DX  out[0..7] cursors
//   R12                               start offset
//   Z0-Z7                             matrix[0..7], each broadcast to every 64-bit lane
//   Z8-Z15                            output accumulators; Z16 input block; Z17 scratch
TEXT ·mulGFNI_1x8_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DX

	// Add start offset to input
	ADDQ R12, CX

mulGFNI_1x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (BX), Z8
	VMOVDQU64 (SI), Z9
	VMOVDQU64 (DI), Z10
	VMOVDQU64 (R8), Z11
	VMOVDQU64 (R9), Z12
	VMOVDQU64 (R10), Z13
	VMOVDQU64 (R11), Z14
	VMOVDQU64 (DX), Z15

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (CX), Z16
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
	VXORPD Z8, Z17, Z8
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
	VXORPD Z9, Z17, Z9
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD Z10, Z17, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD Z11, Z17, Z11
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Store 8 outputs
	VMOVDQU64 Z8, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z9, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z10, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z11, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z12, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z13, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z14, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z15, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x8_64Xor_loop
	VZEROUPPER

mulGFNI_1x8_64Xor_end:
	RET
// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating kernel: 1 input slice, 8 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..7
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Only matrix[0..5] stay resident; matrix[6] and matrix[7] are re-broadcast
// from memory each iteration because all 16 YMM registers are occupied.
//
// Register use:
//   AX                                 remaining block count
//   CX                                 matrix base (live for the whole loop)
//   DX                                 in[0] cursor
//   SI, DI, R8, R9, R10, R11, R12, BX  out[0..7] cursors
//   R13                                start offset
//   Y0-Y5                              matrix[0..5], each broadcast to every 64-bit lane
//   Y6-Y13                             output accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88
	// Loading 6 of 8 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), BX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, BX

	// Add start offset to input
	ADDQ R13, DX

mulAvxGFNI_1x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (SI), Y6
	VMOVDQU (DI), Y7
	VMOVDQU (R8), Y8
	VMOVDQU (R9), Y9
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (BX), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	// matrix[6] and matrix[7] are fetched through the scratch register.
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y7, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (BX)
	ADDQ $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x8Xor_loop
	VZEROUPPER

mulAvxGFNI_1x8Xor_end:
	RET
// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting kernel: 1 input slice, 9 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..8
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                                     remaining block count
//   CX                                     matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, R11, R12, DX  out[0..8] cursors
//   R13                                    start offset
//   Z0-Z8                                  matrix[0..8], each broadcast to every 64-bit lane
//   Z9-Z17                                 results; Z17 also holds the input block
TEXT ·mulGFNI_1x9_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DX

	// Add start offset to input
	ADDQ R13, CX

mulGFNI_1x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (CX), Z17
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z17, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z17, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z17, Z11
	VGF2P8AFFINEQB $0x00, Z3, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z4, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z5, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z6, Z17, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z17, Z16
	// Last use of the input: its register is reused for the final result.
	VGF2P8AFFINEQB $0x00, Z8, Z17, Z17

	// Store 9 outputs
	VMOVDQU64 Z9, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z10, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z11, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z12, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z13, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z14, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z15, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z16, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z17, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x9_64_loop
	VZEROUPPER

mulGFNI_1x9_64_end:
	RET
// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting kernel: 1 input slice, 9 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..8
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Only matrix[0..4] stay resident; matrix[5..8] are re-broadcast from memory
// on every iteration.
//
// Register use:
//   AX                                      remaining block count
//   CX                                      matrix base (live for the whole loop)
//   DX                                      in[0] cursor
//   SI, DI, R8, R9, R10, R11, R12, R13, BX  out[0..8] cursors
//   R14                                     start offset
//   Y0-Y4                                   matrix[0..4], each broadcast to every 64-bit lane
//   Y5-Y13                                  results; Y13 also holds the input block
//   Y14                                     scratch for matrix[8]
TEXT ·mulAvxGFNI_1x9(SB), $0-88
	// Loading 5 of 9 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), BX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, BX

	// Add start offset to input
	ADDQ R14, DX

mulAvxGFNI_1x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (DX), Y13
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y13, Y9
	// Non-resident matrices are broadcast straight into the result register.
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
	// Last result overwrites the input, so its matrix goes through Y14.
	VBROADCASTSD 64(CX), Y14
	VGF2P8AFFINEQB $0x00, Y14, Y13, Y13

	// Store 9 outputs
	VMOVDQU Y5, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (BX)
	ADDQ $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x9_loop
	VZEROUPPER

mulAvxGFNI_1x9_end:
	RET
// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating kernel: 1 input slice, 9 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..8
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                                     remaining block count
//   CX                                     matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, R11, R12, DX  out[0..8] cursors
//   R13                                    start offset
//   Z0-Z8                                  matrix[0..8], each broadcast to every 64-bit lane
//   Z9-Z17                                 output accumulators; Z18 input block; Z19 scratch
TEXT ·mulGFNI_1x9_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x9_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DX

	// Add start offset to input
	ADDQ R13, CX

mulGFNI_1x9_64Xor_loop:
	// Load 9 outputs
	VMOVDQU64 (BX), Z9
	VMOVDQU64 (SI), Z10
	VMOVDQU64 (DI), Z11
	VMOVDQU64 (R8), Z12
	VMOVDQU64 (R9), Z13
	VMOVDQU64 (R10), Z14
	VMOVDQU64 (R11), Z15
	VMOVDQU64 (R12), Z16
	VMOVDQU64 (DX), Z17

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (CX), Z18
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
	VXORPD Z9, Z19, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
	VXORPD Z10, Z19, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
	VXORPD Z11, Z19, Z11
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD Z12, Z19, Z12
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD Z13, Z19, Z13
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD Z14, Z19, Z14
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Store 9 outputs
	VMOVDQU64 Z9, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z10, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z11, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z12, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z13, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z14, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z15, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z16, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z17, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x9_64Xor_loop
	VZEROUPPER

mulGFNI_1x9_64Xor_end:
	RET
// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating kernel: 1 input slice, 9 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..8
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Only matrix[0..4] stay resident; matrix[5..8] are re-broadcast from memory
// on every iteration because all 16 YMM registers are occupied.
//
// Register use:
//   AX                                      remaining block count
//   CX                                      matrix base (live for the whole loop)
//   DX                                      in[0] cursor
//   SI, DI, R8, R9, R10, R11, R12, R13, BX  out[0..8] cursors
//   R14                                     start offset
//   Y0-Y4                                   matrix[0..4], each broadcast to every 64-bit lane
//   Y5-Y13                                  output accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88
	// Loading 5 of 9 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x9Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), BX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, BX

	// Add start offset to input
	ADDQ R14, DX

mulAvxGFNI_1x9Xor_loop:
	// Load 9 outputs
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU (R8), Y7
	VMOVDQU (R9), Y8
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (BX), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y9, Y15, Y9
	// Non-resident matrices are fetched through the scratch register.
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y6, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y8, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (BX)
	ADDQ $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x9Xor_loop
	VZEROUPPER

mulAvxGFNI_1x9Xor_end:
	RET
// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting kernel: 1 input slice, 10 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..9
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                                          remaining block count
//   CX                                          matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, R11, R12, R13, DX  out[0..9] cursors
//   R14                                         start offset
//   Z0-Z9                                       matrix[0..9], each broadcast to every 64-bit lane
//   Z10-Z19                                     results; Z19 also holds the input block
TEXT ·mulGFNI_1x10_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x10_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, DX

	// Add start offset to input
	ADDQ R14, CX

mulGFNI_1x10_64_loop:
	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (CX), Z19
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z19, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z19, Z11
	VGF2P8AFFINEQB $0x00, Z2, Z19, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z19, Z13
	VGF2P8AFFINEQB $0x00, Z4, Z19, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z19, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z19, Z18
	// Last use of the input: its register is reused for the final result.
	VGF2P8AFFINEQB $0x00, Z9, Z19, Z19

	// Store 10 outputs
	VMOVDQU64 Z10, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z11, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z12, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z15, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z16, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z17, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z18, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z19, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x10_64_loop
	VZEROUPPER

mulGFNI_1x10_64_end:
	RET
// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting kernel: 1 input slice, 10 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] = VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..9
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Only matrix[0..3] stay resident; matrix[4..9] are re-broadcast from memory
// on every iteration.
//
// Register use:
//   AX                                           remaining block count
//   CX                                           matrix base (live for the whole loop)
//   DX                                           in[0] cursor
//   SI, DI, R8, R9, R10, R11, R12, R13, R14, BX  out[0..9] cursors
//   R15                                          start offset
//   Y0-Y3                                        matrix[0..3], each broadcast to every 64-bit lane
//   Y4-Y13                                       results; Y13 also holds the input block
//   Y14                                          scratch for matrix[9]
TEXT ·mulAvxGFNI_1x10(SB), $0-88
	// Loading 4 of 10 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x10_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), R14
	MOVQ 216(BX), BX
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, BX

	// Add start offset to input
	ADDQ R15, DX

mulAvxGFNI_1x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (DX), Y13
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y13, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y13, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y13, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y13, Y7
	// Non-resident matrices are broadcast straight into the result register.
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y13, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y13, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y13, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y13, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y13, Y12
	// Last result overwrites the input, so its matrix goes through Y14.
	VBROADCASTSD 72(CX), Y14
	VGF2P8AFFINEQB $0x00, Y14, Y13, Y13

	// Store 10 outputs
	VMOVDQU Y4, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y5, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (BX)
	ADDQ $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x10_loop
	VZEROUPPER

mulAvxGFNI_1x10_end:
	RET
// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating kernel: 1 input slice, 10 output slices, 64 bytes per iteration.
// For each of the n>>6 whole 64-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..9
// The n&63 tail bytes are not processed. When n < 64 the function returns
// before dereferencing matrix, in or out.
//
// Register use:
//   AX                                          remaining block count
//   CX                                          matrix base during setup, then in[0] cursor
//   BX, SI, DI, R8, R9, R10, R11, R12, R13, DX  out[0..9] cursors
//   R14                                         start offset
//   Z0-Z9                                       matrix[0..9], each broadcast to every 64-bit lane
//   Z10-Z19                                     output accumulators; Z20 input block; Z21 scratch
TEXT ·mulGFNI_1x10_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_1x10_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), CX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, DX

	// Add start offset to input
	ADDQ R14, CX

mulGFNI_1x10_64Xor_loop:
	// Load 10 outputs
	VMOVDQU64 (BX), Z10
	VMOVDQU64 (SI), Z11
	VMOVDQU64 (DI), Z12
	VMOVDQU64 (R8), Z13
	VMOVDQU64 (R9), Z14
	VMOVDQU64 (R10), Z15
	VMOVDQU64 (R11), Z16
	VMOVDQU64 (R12), Z17
	VMOVDQU64 (R13), Z18
	VMOVDQU64 (DX), Z19

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (CX), Z20
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD Z10, Z21, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD Z11, Z21, Z11
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD Z12, Z21, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD Z13, Z21, Z13
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD Z14, Z21, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Store 10 outputs
	VMOVDQU64 Z10, (BX)
	ADDQ $0x40, BX
	VMOVDQU64 Z11, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z12, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z15, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z16, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z17, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z18, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z19, (DX)
	ADDQ $0x40, DX

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_1x10_64Xor_loop
	VZEROUPPER

mulGFNI_1x10_64Xor_end:
	RET
// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating kernel: 1 input slice, 10 output slices, 32 bytes per iteration.
// For each of the n>>5 whole 32-byte blocks b, beginning at byte offset start:
//   out[k][b] ^= VGF2P8AFFINEQB(in[0][b], matrix[k], imm 0)    for k = 0..9
// The n&31 tail bytes are not processed. When n < 32 the function returns
// before dereferencing matrix, in or out.
//
// Only matrix[0..3] stay resident; matrix[4..9] are re-broadcast from memory
// on every iteration because all 16 YMM registers are occupied.
//
// Register use:
//   AX                                           remaining block count
//   CX                                           matrix base (live for the whole loop)
//   DX                                           in[0] cursor
//   SI, DI, R8, R9, R10, R11, R12, R13, R14, BX  out[0..9] cursors
//   R15                                          start offset
//   Y0-Y3                                        matrix[0..3], each broadcast to every 64-bit lane
//   Y4-Y13                                       output accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88
	// Loading 4 of 10 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_1x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), DX
	// Slice headers in out are 24 bytes apart; fetch each data pointer.
	MOVQ out_base+48(FP), BX
	MOVQ (BX), SI
	MOVQ 24(BX), DI
	MOVQ 48(BX), R8
	MOVQ 72(BX), R9
	MOVQ 96(BX), R10
	MOVQ 120(BX), R11
	MOVQ 144(BX), R12
	MOVQ 168(BX), R13
	MOVQ 192(BX), R14
	MOVQ 216(BX), BX
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, BX

	// Add start offset to input
	ADDQ R15, DX

mulAvxGFNI_1x10Xor_loop:
	// Load 10 outputs
	VMOVDQU (SI), Y4
	VMOVDQU (DI), Y5
	VMOVDQU (R8), Y6
	VMOVDQU (R9), Y7
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (BX), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	// Non-resident matrices are fetched through the scratch register.
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	VMOVDQU Y4, (SI)
	ADDQ $0x20, SI
	VMOVDQU Y5, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (BX)
	ADDQ $0x20, BX

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_1x10Xor_loop
	VZEROUPPER

mulAvxGFNI_1x10Xor_end:
	RET
// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 1 output, 64 bytes per iteration; the output is overwritten.
// For each of the n/64 blocks, beginning at byte offset start:
//   out[0] = A(matrix[0], in[0]) ^ A(matrix[1], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors, BX = out[0] cursor,
// Z0-Z1 = broadcast matrix entries, Z2 = accumulator, Z3 = input/scratch.
TEXT ·mulGFNI_2x1_64(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 5 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x1_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), BX // out[0] data pointer
    MOVQ start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX

    // Add start offset to input
    ADDQ SI, DX
    ADDQ SI, CX

mulGFNI_2x1_64_loop:
    // Load and process 64 bytes from input 0 to 1 output
    VMOVDQU64 (DX), Z3
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z3, Z2

    // Load and process 64 bytes from input 1 to 1 output
    VMOVDQU64 (CX), Z3
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
    VXORPD Z2, Z3, Z2

    // Store 1 output
    VMOVDQU64 Z2, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x1_64_loop
    VZEROUPPER

mulGFNI_2x1_64_end:
    RET
// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 1 output, 32 bytes per iteration; the output is overwritten.
// For each of the n/32 blocks, beginning at byte offset start:
//   out[0] = A(matrix[0], in[0]) ^ A(matrix[1], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors, BX = out[0] cursor,
// Y0-Y1 = broadcast matrix entries, Y2 = accumulator, Y3 = input/scratch.
TEXT ·mulAvxGFNI_2x1(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 5 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x1_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), BX // out[0] data pointer
    MOVQ start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX

    // Add start offset to input
    ADDQ SI, DX
    ADDQ SI, CX

mulAvxGFNI_2x1_loop:
    // Load and process 32 bytes from input 0 to 1 output
    VMOVDQU (DX), Y3
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y3, Y2

    // Load and process 32 bytes from input 1 to 1 output
    VMOVDQU (CX), Y3
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
    VXORPD Y2, Y3, Y2

    // Store 1 output
    VMOVDQU Y2, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x1_loop
    VZEROUPPER

mulAvxGFNI_2x1_end:
    RET
// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 1 output, 64 bytes per iteration; the result is XORed into the
// bytes already present in the output.
// For each of the n/64 blocks, beginning at byte offset start:
//   out[0] ^= A(matrix[0], in[0]) ^ A(matrix[1], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors, BX = out[0] cursor,
// Z0-Z1 = broadcast matrix entries, Z2 = accumulator, Z3 = input/scratch.
TEXT ·mulGFNI_2x1_64Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 5 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x1_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), BX // out[0] data pointer
    MOVQ start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX

    // Add start offset to input
    ADDQ SI, DX
    ADDQ SI, CX

mulGFNI_2x1_64Xor_loop:
    // Load 1 output
    VMOVDQU64 (BX), Z2

    // Load and process 64 bytes from input 0 to 1 output
    VMOVDQU64 (DX), Z3
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z3, Z3
    VXORPD Z2, Z3, Z2

    // Load and process 64 bytes from input 1 to 1 output
    VMOVDQU64 (CX), Z3
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z1, Z3, Z3
    VXORPD Z2, Z3, Z2

    // Store 1 output
    VMOVDQU64 Z2, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x1_64Xor_loop
    VZEROUPPER

mulGFNI_2x1_64Xor_end:
    RET
// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 1 output, 32 bytes per iteration; the result is XORed into the
// bytes already present in the output.
// For each of the n/32 blocks, beginning at byte offset start:
//   out[0] ^= A(matrix[0], in[0]) ^ A(matrix[1], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors, BX = out[0] cursor,
// Y0-Y1 = broadcast matrix entries, Y2 = accumulator, Y3 = input/scratch.
TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 5 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x1Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), BX // out[0] data pointer
    MOVQ start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX

    // Add start offset to input
    ADDQ SI, DX
    ADDQ SI, CX

mulAvxGFNI_2x1Xor_loop:
    // Load 1 output
    VMOVDQU (BX), Y2

    // Load and process 32 bytes from input 0 to 1 output
    VMOVDQU (DX), Y3
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y3, Y3
    VXORPD Y2, Y3, Y2

    // Load and process 32 bytes from input 1 to 1 output
    VMOVDQU (CX), Y3
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y1, Y3, Y3
    VXORPD Y2, Y3, Y2

    // Store 1 output
    VMOVDQU Y2, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x1Xor_loop
    VZEROUPPER

mulAvxGFNI_2x1Xor_end:
    RET
// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 2 outputs, 64 bytes per iteration; outputs are overwritten.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..1:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[2+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/BX = out[0]/out[1] cursors, Z0-Z3 = broadcast matrix entries,
// Z4-Z5 = accumulators, Z6 = input, Z7 = scratch.
TEXT ·mulGFNI_2x2_64(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 8 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x2_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), BX // out[1] data pointer
    MOVQ start+72(FP), DI

    // Add start offset to output
    ADDQ DI, SI
    ADDQ DI, BX

    // Add start offset to input
    ADDQ DI, DX
    ADDQ DI, CX

mulGFNI_2x2_64_loop:
    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU64 (DX), Z6
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z6, Z4
    VGF2P8AFFINEQB $0x00, Z1, Z6, Z5

    // Load and process 64 bytes from input 1 to 2 outputs
    VMOVDQU64 (CX), Z6
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
    VXORPD Z4, Z7, Z4
    VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
    VXORPD Z5, Z7, Z5

    // Store 2 outputs
    VMOVDQU64 Z4, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z5, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x2_64_loop
    VZEROUPPER

mulGFNI_2x2_64_end:
    RET
// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 2 outputs, 32 bytes per iteration; outputs are overwritten.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..1:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[2+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/BX = out[0]/out[1] cursors, Y0-Y3 = broadcast matrix entries,
// Y4-Y5 = accumulators, Y6 = input, Y7 = scratch.
TEXT ·mulAvxGFNI_2x2(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 8 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x2_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), BX // out[1] data pointer
    MOVQ start+72(FP), DI

    // Add start offset to output
    ADDQ DI, SI
    ADDQ DI, BX

    // Add start offset to input
    ADDQ DI, DX
    ADDQ DI, CX

mulAvxGFNI_2x2_loop:
    // Load and process 32 bytes from input 0 to 2 outputs
    VMOVDQU (DX), Y6
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y6, Y4
    VGF2P8AFFINEQB $0x00, Y1, Y6, Y5

    // Load and process 32 bytes from input 1 to 2 outputs
    VMOVDQU (CX), Y6
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
    VXORPD Y4, Y7, Y4
    VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
    VXORPD Y5, Y7, Y5

    // Store 2 outputs
    VMOVDQU Y4, (SI)
    ADDQ $0x20, SI
    VMOVDQU Y5, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x2_loop
    VZEROUPPER

mulAvxGFNI_2x2_end:
    RET
// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 2 outputs, 64 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..1:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[2+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/BX = out[0]/out[1] cursors, Z0-Z3 = broadcast matrix entries,
// Z4-Z5 = accumulators, Z6 = input, Z7 = scratch.
TEXT ·mulGFNI_2x2_64Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 8 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x2_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), BX // out[1] data pointer
    MOVQ start+72(FP), DI

    // Add start offset to output
    ADDQ DI, SI
    ADDQ DI, BX

    // Add start offset to input
    ADDQ DI, DX
    ADDQ DI, CX

mulGFNI_2x2_64Xor_loop:
    // Load 2 outputs
    VMOVDQU64 (SI), Z4
    VMOVDQU64 (BX), Z5

    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU64 (DX), Z6
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
    VXORPD Z4, Z7, Z4
    VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
    VXORPD Z5, Z7, Z5

    // Load and process 64 bytes from input 1 to 2 outputs
    VMOVDQU64 (CX), Z6
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
    VXORPD Z4, Z7, Z4
    VGF2P8AFFINEQB $0x00, Z3, Z6, Z7
    VXORPD Z5, Z7, Z5

    // Store 2 outputs
    VMOVDQU64 Z4, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z5, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x2_64Xor_loop
    VZEROUPPER

mulGFNI_2x2_64Xor_end:
    RET
// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 2 outputs, 32 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..1:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[2+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/BX = out[0]/out[1] cursors, Y0-Y3 = broadcast matrix entries,
// Y4-Y5 = accumulators, Y6 = input, Y7 = scratch.
TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 8 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x2Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), BX // out[1] data pointer
    MOVQ start+72(FP), DI

    // Add start offset to output
    ADDQ DI, SI
    ADDQ DI, BX

    // Add start offset to input
    ADDQ DI, DX
    ADDQ DI, CX

mulAvxGFNI_2x2Xor_loop:
    // Load 2 outputs
    VMOVDQU (SI), Y4
    VMOVDQU (BX), Y5

    // Load and process 32 bytes from input 0 to 2 outputs
    VMOVDQU (DX), Y6
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
    VXORPD Y4, Y7, Y4
    VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
    VXORPD Y5, Y7, Y5

    // Load and process 32 bytes from input 1 to 2 outputs
    VMOVDQU (CX), Y6
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
    VXORPD Y4, Y7, Y4
    VGF2P8AFFINEQB $0x00, Y3, Y6, Y7
    VXORPD Y5, Y7, Y5

    // Store 2 outputs
    VMOVDQU Y4, (SI)
    ADDQ $0x20, SI
    VMOVDQU Y5, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x2Xor_loop
    VZEROUPPER

mulAvxGFNI_2x2Xor_end:
    RET
// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 3 outputs, 64 bytes per iteration; outputs are overwritten.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..2:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[3+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/BX = out[0..2] cursors, Z0-Z5 = broadcast matrix entries,
// Z6-Z8 = accumulators, Z9 = input, Z10 = scratch.
TEXT ·mulGFNI_2x3_64(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 11 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x3_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), BX // out[2] data pointer
    MOVQ start+72(FP), R8

    // Add start offset to output
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, BX

    // Add start offset to input
    ADDQ R8, DX
    ADDQ R8, CX

mulGFNI_2x3_64_loop:
    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU64 (DX), Z9
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z9, Z6
    VGF2P8AFFINEQB $0x00, Z1, Z9, Z7
    VGF2P8AFFINEQB $0x00, Z2, Z9, Z8

    // Load and process 64 bytes from input 1 to 3 outputs
    VMOVDQU64 (CX), Z9
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
    VXORPD Z6, Z10, Z6
    VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
    VXORPD Z7, Z10, Z7
    VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
    VXORPD Z8, Z10, Z8

    // Store 3 outputs
    VMOVDQU64 Z6, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z7, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z8, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x3_64_loop
    VZEROUPPER

mulGFNI_2x3_64_end:
    RET
// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 3 outputs, 32 bytes per iteration; outputs are overwritten.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..2:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[3+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/BX = out[0..2] cursors, Y0-Y5 = broadcast matrix entries,
// Y6-Y8 = accumulators, Y9 = input, Y10 = scratch.
TEXT ·mulAvxGFNI_2x3(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 11 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x3_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), BX // out[2] data pointer
    MOVQ start+72(FP), R8

    // Add start offset to output
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, BX

    // Add start offset to input
    ADDQ R8, DX
    ADDQ R8, CX

mulAvxGFNI_2x3_loop:
    // Load and process 32 bytes from input 0 to 3 outputs
    VMOVDQU (DX), Y9
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y9, Y6
    VGF2P8AFFINEQB $0x00, Y1, Y9, Y7
    VGF2P8AFFINEQB $0x00, Y2, Y9, Y8

    // Load and process 32 bytes from input 1 to 3 outputs
    VMOVDQU (CX), Y9
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
    VXORPD Y6, Y10, Y6
    VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
    VXORPD Y7, Y10, Y7
    VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
    VXORPD Y8, Y10, Y8

    // Store 3 outputs
    VMOVDQU Y6, (SI)
    ADDQ $0x20, SI
    VMOVDQU Y7, (DI)
    ADDQ $0x20, DI
    VMOVDQU Y8, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x3_loop
    VZEROUPPER

mulAvxGFNI_2x3_end:
    RET
// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 3 outputs, 64 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..2:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[3+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/BX = out[0..2] cursors, Z0-Z5 = broadcast matrix entries,
// Z6-Z8 = accumulators, Z9 = input, Z10 = scratch.
TEXT ·mulGFNI_2x3_64Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 11 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x3_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), BX // out[2] data pointer
    MOVQ start+72(FP), R8

    // Add start offset to output
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, BX

    // Add start offset to input
    ADDQ R8, DX
    ADDQ R8, CX

mulGFNI_2x3_64Xor_loop:
    // Load 3 outputs
    VMOVDQU64 (SI), Z6
    VMOVDQU64 (DI), Z7
    VMOVDQU64 (BX), Z8

    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU64 (DX), Z9
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z9, Z10
    VXORPD Z6, Z10, Z6
    VGF2P8AFFINEQB $0x00, Z1, Z9, Z10
    VXORPD Z7, Z10, Z7
    VGF2P8AFFINEQB $0x00, Z2, Z9, Z10
    VXORPD Z8, Z10, Z8

    // Load and process 64 bytes from input 1 to 3 outputs
    VMOVDQU64 (CX), Z9
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z3, Z9, Z10
    VXORPD Z6, Z10, Z6
    VGF2P8AFFINEQB $0x00, Z4, Z9, Z10
    VXORPD Z7, Z10, Z7
    VGF2P8AFFINEQB $0x00, Z5, Z9, Z10
    VXORPD Z8, Z10, Z8

    // Store 3 outputs
    VMOVDQU64 Z6, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z7, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z8, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x3_64Xor_loop
    VZEROUPPER

mulGFNI_2x3_64Xor_end:
    RET
// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 3 outputs, 32 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..2:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[3+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/BX = out[0..2] cursors, Y0-Y5 = broadcast matrix entries,
// Y6-Y8 = accumulators, Y9 = input, Y10 = scratch.
TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 11 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x3Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), BX // out[2] data pointer
    MOVQ start+72(FP), R8

    // Add start offset to output
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, BX

    // Add start offset to input
    ADDQ R8, DX
    ADDQ R8, CX

mulAvxGFNI_2x3Xor_loop:
    // Load 3 outputs
    VMOVDQU (SI), Y6
    VMOVDQU (DI), Y7
    VMOVDQU (BX), Y8

    // Load and process 32 bytes from input 0 to 3 outputs
    VMOVDQU (DX), Y9
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y9, Y10
    VXORPD Y6, Y10, Y6
    VGF2P8AFFINEQB $0x00, Y1, Y9, Y10
    VXORPD Y7, Y10, Y7
    VGF2P8AFFINEQB $0x00, Y2, Y9, Y10
    VXORPD Y8, Y10, Y8

    // Load and process 32 bytes from input 1 to 3 outputs
    VMOVDQU (CX), Y9
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y3, Y9, Y10
    VXORPD Y6, Y10, Y6
    VGF2P8AFFINEQB $0x00, Y4, Y9, Y10
    VXORPD Y7, Y10, Y7
    VGF2P8AFFINEQB $0x00, Y5, Y9, Y10
    VXORPD Y8, Y10, Y8

    // Store 3 outputs
    VMOVDQU Y6, (SI)
    ADDQ $0x20, SI
    VMOVDQU Y7, (DI)
    ADDQ $0x20, DI
    VMOVDQU Y8, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x3Xor_loop
    VZEROUPPER

mulAvxGFNI_2x3Xor_end:
    RET
// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 4 outputs, 64 bytes per iteration; outputs are overwritten.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..3:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[4+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/BX = out[0..3] cursors, Z0-Z7 = broadcast matrix entries,
// Z8-Z11 = accumulators, Z12 = input, Z13 = scratch.
TEXT ·mulGFNI_2x4_64(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 14 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x4_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), BX // out[3] data pointer
    MOVQ start+72(FP), R9

    // Add start offset to output
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, BX

    // Add start offset to input
    ADDQ R9, DX
    ADDQ R9, CX

mulGFNI_2x4_64_loop:
    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64 (DX), Z12
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z12, Z8
    VGF2P8AFFINEQB $0x00, Z1, Z12, Z9
    VGF2P8AFFINEQB $0x00, Z2, Z12, Z10
    VGF2P8AFFINEQB $0x00, Z3, Z12, Z11

    // Load and process 64 bytes from input 1 to 4 outputs
    VMOVDQU64 (CX), Z12
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
    VXORPD Z8, Z13, Z8
    VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
    VXORPD Z9, Z13, Z9
    VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
    VXORPD Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
    VXORPD Z11, Z13, Z11

    // Store 4 outputs
    VMOVDQU64 Z8, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z9, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z10, (R8)
    ADDQ $0x40, R8
    VMOVDQU64 Z11, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x4_64_loop
    VZEROUPPER

mulGFNI_2x4_64_end:
    RET
// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 4 outputs, 32 bytes per iteration; outputs are overwritten.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..3:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[4+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/BX = out[0..3] cursors, Y0-Y7 = broadcast matrix entries,
// Y8-Y11 = accumulators, Y12 = input, Y13 = scratch.
TEXT ·mulAvxGFNI_2x4(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 14 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x4_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), BX // out[3] data pointer
    MOVQ start+72(FP), R9

    // Add start offset to output
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, BX

    // Add start offset to input
    ADDQ R9, DX
    ADDQ R9, CX

mulAvxGFNI_2x4_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y12
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y12, Y8
    VGF2P8AFFINEQB $0x00, Y1, Y12, Y9
    VGF2P8AFFINEQB $0x00, Y2, Y12, Y10
    VGF2P8AFFINEQB $0x00, Y3, Y12, Y11

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (CX), Y12
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
    VXORPD Y8, Y13, Y8
    VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
    VXORPD Y9, Y13, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
    VXORPD Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
    VXORPD Y11, Y13, Y11

    // Store 4 outputs
    VMOVDQU Y8, (SI)
    ADDQ $0x20, SI
    VMOVDQU Y9, (DI)
    ADDQ $0x20, DI
    VMOVDQU Y10, (R8)
    ADDQ $0x20, R8
    VMOVDQU Y11, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x4_loop
    VZEROUPPER

mulAvxGFNI_2x4_end:
    RET
// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 4 outputs, 64 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..3:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[4+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/BX = out[0..3] cursors, Z0-Z7 = broadcast matrix entries,
// Z8-Z11 = accumulators, Z12 = input, Z13 = scratch.
TEXT ·mulGFNI_2x4_64Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 14 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x4_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), BX // out[3] data pointer
    MOVQ start+72(FP), R9

    // Add start offset to output
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, BX

    // Add start offset to input
    ADDQ R9, DX
    ADDQ R9, CX

mulGFNI_2x4_64Xor_loop:
    // Load 4 outputs
    VMOVDQU64 (SI), Z8
    VMOVDQU64 (DI), Z9
    VMOVDQU64 (R8), Z10
    VMOVDQU64 (BX), Z11

    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64 (DX), Z12
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
    VXORPD Z8, Z13, Z8
    VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
    VXORPD Z9, Z13, Z9
    VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
    VXORPD Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
    VXORPD Z11, Z13, Z11

    // Load and process 64 bytes from input 1 to 4 outputs
    VMOVDQU64 (CX), Z12
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
    VXORPD Z8, Z13, Z8
    VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
    VXORPD Z9, Z13, Z9
    VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
    VXORPD Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
    VXORPD Z11, Z13, Z11

    // Store 4 outputs
    VMOVDQU64 Z8, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z9, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z10, (R8)
    ADDQ $0x40, R8
    VMOVDQU64 Z11, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x4_64Xor_loop
    VZEROUPPER

mulGFNI_2x4_64Xor_end:
    RET
// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 4 outputs, 32 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..3:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[4+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/BX = out[0..3] cursors, Y0-Y7 = broadcast matrix entries,
// Y8-Y11 = accumulators, Y12 = input, Y13 = scratch.
TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 14 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x4Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), BX // out[3] data pointer
    MOVQ start+72(FP), R9

    // Add start offset to output
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, BX

    // Add start offset to input
    ADDQ R9, DX
    ADDQ R9, CX

mulAvxGFNI_2x4Xor_loop:
    // Load 4 outputs
    VMOVDQU (SI), Y8
    VMOVDQU (DI), Y9
    VMOVDQU (R8), Y10
    VMOVDQU (BX), Y11

    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y12
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
    VXORPD Y8, Y13, Y8
    VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
    VXORPD Y9, Y13, Y9
    VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
    VXORPD Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
    VXORPD Y11, Y13, Y11

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (CX), Y12
    ADDQ $0x20, CX
    VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
    VXORPD Y8, Y13, Y8
    VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
    VXORPD Y9, Y13, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
    VXORPD Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
    VXORPD Y11, Y13, Y11

    // Store 4 outputs
    VMOVDQU Y8, (SI)
    ADDQ $0x20, SI
    VMOVDQU Y9, (DI)
    ADDQ $0x20, DI
    VMOVDQU Y10, (R8)
    ADDQ $0x20, R8
    VMOVDQU Y11, (BX)
    ADDQ $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x4Xor_loop
    VZEROUPPER

mulAvxGFNI_2x4Xor_end:
    RET
// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 5 outputs, 64 bytes per iteration; outputs are overwritten.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..4:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[5+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/R9/BX = out[0..4] cursors, Z0-Z9 = broadcast matrix entries,
// Z10-Z14 = accumulators, Z15 = input, Z16 = scratch.
TEXT ·mulGFNI_2x5_64(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 17 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x5_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), R9 // out[3] data pointer
    MOVQ 96(BX), BX // out[4] data pointer
    MOVQ start+72(FP), R10

    // Add start offset to output
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, BX

    // Add start offset to input
    ADDQ R10, DX
    ADDQ R10, CX

mulGFNI_2x5_64_loop:
    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64 (DX), Z15
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z15, Z10
    VGF2P8AFFINEQB $0x00, Z1, Z15, Z11
    VGF2P8AFFINEQB $0x00, Z2, Z15, Z12
    VGF2P8AFFINEQB $0x00, Z3, Z15, Z13
    VGF2P8AFFINEQB $0x00, Z4, Z15, Z14

    // Load and process 64 bytes from input 1 to 5 outputs
    VMOVDQU64 (CX), Z15
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
    VXORPD Z10, Z16, Z10
    VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
    VXORPD Z11, Z16, Z11
    VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
    VXORPD Z12, Z16, Z12
    VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
    VXORPD Z13, Z16, Z13
    VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
    VXORPD Z14, Z16, Z14

    // Store 5 outputs
    VMOVDQU64 Z10, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z11, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z12, (R8)
    ADDQ $0x40, R8
    VMOVDQU64 Z13, (R9)
    ADDQ $0x40, R9
    VMOVDQU64 Z14, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x5_64_loop
    VZEROUPPER

mulGFNI_2x5_64_end:
    RET
// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 5 outputs, 32 bytes per iteration; outputs are overwritten.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..4:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[5+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Only matrix[0..8] stay resident (Y0-Y8); matrix[9] is re-broadcast from
// memory inside the loop, so CX keeps pointing at matrix for the whole call.
//
// Registers: AX = blocks left, CX = matrix, BX/DX = in[0]/in[1] cursors,
// DI/R8/R9/R10/SI = out[0..4] cursors, Y9-Y13 = accumulators,
// Y14 = input, Y15 = scratch.
TEXT ·mulAvxGFNI_2x5(SB), $0-88
    // Loading 9 of 10 tables to registers; destinations are kept in GP registers.
    // Generator estimate: 17 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x5_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX // in[0] data pointer
    MOVQ 24(DX), DX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), SI
    MOVQ (SI), DI // out[0] data pointer
    MOVQ 24(SI), R8 // out[1] data pointer
    MOVQ 48(SI), R9 // out[2] data pointer
    MOVQ 72(SI), R10 // out[3] data pointer
    MOVQ 96(SI), SI // out[4] data pointer
    MOVQ start+72(FP), R11

    // Add start offset to output
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, SI

    // Add start offset to input
    ADDQ R11, BX
    ADDQ R11, DX

mulAvxGFNI_2x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 72(CX), Y15 // matrix[9] is not resident; reload it each iteration
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 5 outputs
    VMOVDQU Y9, (DI)
    ADDQ $0x20, DI
    VMOVDQU Y10, (R8)
    ADDQ $0x20, R8
    VMOVDQU Y11, (R9)
    ADDQ $0x20, R9
    VMOVDQU Y12, (R10)
    ADDQ $0x20, R10
    VMOVDQU Y13, (SI)
    ADDQ $0x20, SI

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x5_loop
    VZEROUPPER

mulAvxGFNI_2x5_end:
    RET
// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 5 outputs, 64 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..4:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[5+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/R9/BX = out[0..4] cursors, Z0-Z9 = broadcast matrix entries,
// Z10-Z14 = accumulators, Z15 = input, Z16 = scratch.
TEXT ·mulGFNI_2x5_64Xor(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 17 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x5_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), R9 // out[3] data pointer
    MOVQ 96(BX), BX // out[4] data pointer
    MOVQ start+72(FP), R10

    // Add start offset to output
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, BX

    // Add start offset to input
    ADDQ R10, DX
    ADDQ R10, CX

mulGFNI_2x5_64Xor_loop:
    // Load 5 outputs
    VMOVDQU64 (SI), Z10
    VMOVDQU64 (DI), Z11
    VMOVDQU64 (R8), Z12
    VMOVDQU64 (R9), Z13
    VMOVDQU64 (BX), Z14

    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64 (DX), Z15
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z15, Z16
    VXORPD Z10, Z16, Z10
    VGF2P8AFFINEQB $0x00, Z1, Z15, Z16
    VXORPD Z11, Z16, Z11
    VGF2P8AFFINEQB $0x00, Z2, Z15, Z16
    VXORPD Z12, Z16, Z12
    VGF2P8AFFINEQB $0x00, Z3, Z15, Z16
    VXORPD Z13, Z16, Z13
    VGF2P8AFFINEQB $0x00, Z4, Z15, Z16
    VXORPD Z14, Z16, Z14

    // Load and process 64 bytes from input 1 to 5 outputs
    VMOVDQU64 (CX), Z15
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z5, Z15, Z16
    VXORPD Z10, Z16, Z10
    VGF2P8AFFINEQB $0x00, Z6, Z15, Z16
    VXORPD Z11, Z16, Z11
    VGF2P8AFFINEQB $0x00, Z7, Z15, Z16
    VXORPD Z12, Z16, Z12
    VGF2P8AFFINEQB $0x00, Z8, Z15, Z16
    VXORPD Z13, Z16, Z13
    VGF2P8AFFINEQB $0x00, Z9, Z15, Z16
    VXORPD Z14, Z16, Z14

    // Store 5 outputs
    VMOVDQU64 Z10, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z11, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z12, (R8)
    ADDQ $0x40, R8
    VMOVDQU64 Z13, (R9)
    ADDQ $0x40, R9
    VMOVDQU64 Z14, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x5_64Xor_loop
    VZEROUPPER

mulGFNI_2x5_64Xor_end:
    RET
// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 2 inputs -> 5 outputs, 32 bytes per iteration; results are XORed into the
// bytes already present in the outputs.
// For each of the n/32 blocks, beginning at byte offset start, and j in 0..4:
//   out[j] ^= A(matrix[j], in[0]) ^ A(matrix[5+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%32 bytes are not processed; n < 32 returns immediately.
//
// Only matrix[0..8] stay resident (Y0-Y8); matrix[9] is re-broadcast from
// memory inside the loop, so CX keeps pointing at matrix for the whole call.
//
// Registers: AX = blocks left, CX = matrix, BX/DX = in[0]/in[1] cursors,
// DI/R8/R9/R10/SI = out[0..4] cursors, Y9-Y13 = accumulators,
// Y14 = input, Y15 = scratch.
TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88
    // Loading 9 of 10 tables to registers; destinations are kept in GP registers.
    // Generator estimate: 17 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX // AX = n / 32
    TESTQ AX, AX
    JZ mulAvxGFNI_2x5Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX // in[0] data pointer
    MOVQ 24(DX), DX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), SI
    MOVQ (SI), DI // out[0] data pointer
    MOVQ 24(SI), R8 // out[1] data pointer
    MOVQ 48(SI), R9 // out[2] data pointer
    MOVQ 72(SI), R10 // out[3] data pointer
    MOVQ 96(SI), SI // out[4] data pointer
    MOVQ start+72(FP), R11

    // Add start offset to output
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, SI

    // Add start offset to input
    ADDQ R11, BX
    ADDQ R11, DX

mulAvxGFNI_2x5Xor_loop:
    // Load 5 outputs
    VMOVDQU (DI), Y9
    VMOVDQU (R8), Y10
    VMOVDQU (R9), Y11
    VMOVDQU (R10), Y12
    VMOVDQU (SI), Y13

    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
    VXORPD Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 72(CX), Y15 // matrix[9] is not resident; reload it each iteration
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 5 outputs
    VMOVDQU Y9, (DI)
    ADDQ $0x20, DI
    VMOVDQU Y10, (R8)
    ADDQ $0x20, R8
    VMOVDQU Y11, (R9)
    ADDQ $0x20, R9
    VMOVDQU Y12, (R10)
    ADDQ $0x20, R10
    VMOVDQU Y13, (SI)
    ADDQ $0x20, SI

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxGFNI_2x5Xor_loop
    VZEROUPPER

mulAvxGFNI_2x5Xor_end:
    RET
// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 2 inputs -> 6 outputs, 64 bytes per iteration; outputs are overwritten.
// For each of the n/64 blocks, beginning at byte offset start, and j in 0..5:
//   out[j] = A(matrix[j], in[0]) ^ A(matrix[6+j], in[1])
// A(m, x) is VGF2P8AFFINEQB with immediate 0: every byte of x is transformed
// by the 8x8 bit matrix held in the uint64 m.
// NOTE(review): m presumably encodes multiplication by a GF(2^8) coefficient;
// confirm against the Go code that builds matrix.
// The remaining n%64 bytes are not processed; n < 64 returns immediately.
//
// Registers: AX = blocks left, DX/CX = in[0]/in[1] cursors,
// SI/DI/R8/R9/R10/BX = out[0..5] cursors, Z0-Z11 = broadcast matrix entries,
// Z12-Z17 = accumulators, Z18 = input, Z19 = scratch.
TEXT ·mulGFNI_2x6_64(SB), $0-88
    // All tables are loaded into registers; destinations are kept in GP registers.
    // Generator estimate: 20 vector registers used.
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX // AX = n / 64
    TESTQ AX, AX
    JZ mulGFNI_2x6_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX // in[0] data pointer
    MOVQ 24(CX), CX // in[1] data pointer (next 24-byte slice header)
    MOVQ out_base+48(FP), BX
    MOVQ (BX), SI // out[0] data pointer
    MOVQ 24(BX), DI // out[1] data pointer
    MOVQ 48(BX), R8 // out[2] data pointer
    MOVQ 72(BX), R9 // out[3] data pointer
    MOVQ 96(BX), R10 // out[4] data pointer
    MOVQ 120(BX), BX // out[5] data pointer
    MOVQ start+72(FP), R11

    // Add start offset to output
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, BX

    // Add start offset to input
    ADDQ R11, DX
    ADDQ R11, CX

mulGFNI_2x6_64_loop:
    // Load and process 64 bytes from input 0 to 6 outputs
    VMOVDQU64 (DX), Z18
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z18, Z12
    VGF2P8AFFINEQB $0x00, Z1, Z18, Z13
    VGF2P8AFFINEQB $0x00, Z2, Z18, Z14
    VGF2P8AFFINEQB $0x00, Z3, Z18, Z15
    VGF2P8AFFINEQB $0x00, Z4, Z18, Z16
    VGF2P8AFFINEQB $0x00, Z5, Z18, Z17

    // Load and process 64 bytes from input 1 to 6 outputs
    VMOVDQU64 (CX), Z18
    ADDQ $0x40, CX
    VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
    VXORPD Z12, Z19, Z12
    VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
    VXORPD Z13, Z19, Z13
    VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
    VXORPD Z14, Z19, Z14
    VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
    VXORPD Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
    VXORPD Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
    VXORPD Z17, Z19, Z17

    // Store 6 outputs
    VMOVDQU64 Z12, (SI)
    ADDQ $0x40, SI
    VMOVDQU64 Z13, (DI)
    ADDQ $0x40, DI
    VMOVDQU64 Z14, (R8)
    ADDQ $0x40, R8
    VMOVDQU64 Z15, (R9)
    ADDQ $0x40, R9
    VMOVDQU64 Z16, (R10)
    ADDQ $0x40, R10
    VMOVDQU64 Z17, (BX)
    ADDQ $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ mulGFNI_2x6_64_loop
    VZEROUPPER

mulGFNI_2x6_64_end:
    RET
// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 6 output slices, 32 bytes per loop iteration.
// For each output j in 0..5 the 32-byte chunk written is
//   out[j] = affine(matrix[j], in[0]) XOR affine(matrix[6+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant. Previous output contents are overwritten.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>5 times: a trailing n&31 bytes are not processed, and for
// n < 32 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Only matrix[0..7] stay resident (Y0-Y7); matrix[8..11] are re-broadcast
// from memory into Y15 on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// Registers: AX = iterations left; CX = matrix base; BX, DX = in[0], in[1]
// cursors; DI, R8, R9, R10, R11, SI = out[0..5] cursors; R12 = start;
// Y8-Y13 = accumulators; Y14 = current input chunk; Y15 = scratch.
TEXT ·mulAvxGFNI_2x6(SB), $0-88
// Loading 8 of 12 tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks
SHRQ $0x05, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulAvxGFNI_2x6_end
// Broadcast the first 8 matrix entries to every 64-bit lane of Y0-Y7
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
MOVQ out_base+48(FP), SI
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
// Last load overwrites SI, the header base, with the out[5] data pointer
MOVQ 120(SI), SI
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, SI
// Add start offset to input
ADDQ R12, BX
ADDQ R12, DX
mulAvxGFNI_2x6_loop:
// Load and process 32 bytes from input 0 to 6 outputs
// (first input initialises the accumulators directly, no XOR needed)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// Remaining tables do not fit in registers: reload each into Y15, which is
// then overwritten by its own product
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (DI)
ADDQ $0x20, DI
VMOVDQU Y9, (R8)
ADDQ $0x20, R8
VMOVDQU Y10, (R9)
ADDQ $0x20, R9
VMOVDQU Y11, (R10)
ADDQ $0x20, R10
VMOVDQU Y12, (R11)
ADDQ $0x20, R11
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x6_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulAvxGFNI_2x6_end:
RET
// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating form: combines 2 input slices into 6 output slices, 64 bytes
// per loop iteration, XORing into what the outputs already hold. For each
// output j in 0..5:
//   out[j] = out[j] XOR affine(matrix[j], in[0]) XOR affine(matrix[6+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>6 times: a trailing n&63 bytes are not processed, and for
// n < 64 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Registers: AX = iterations left; DX, CX = in[0], in[1] cursors;
// SI, DI, R8, R9, R10, BX = out[0..5] cursors; R11 = start;
// Z0-Z5 = matrix[0..5], Z6-Z11 = matrix[6..11]; Z12-Z17 = accumulators;
// Z18 = current input chunk; Z19 = scratch.
TEXT ·mulGFNI_2x6_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks
SHRQ $0x06, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulGFNI_2x6_64Xor_end
// Broadcast each 8-byte matrix entry to every 64-bit lane of a ZMM register
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
// CX is free now that all tables are in registers; reuse it for the inputs.
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
MOVQ out_base+48(FP), BX
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
// Last load overwrites BX, the header base, with the out[5] data pointer
MOVQ 120(BX), BX
MOVQ start+72(FP), R11
// Add start offset to output
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, R10
ADDQ R11, BX
// Add start offset to input
ADDQ R11, DX
ADDQ R11, CX
mulGFNI_2x6_64Xor_loop:
// Load 6 outputs
// (existing output bytes seed the accumulators)
VMOVDQU64 (SI), Z12
VMOVDQU64 (DI), Z13
VMOVDQU64 (R8), Z14
VMOVDQU64 (R9), Z15
VMOVDQU64 (R10), Z16
VMOVDQU64 (BX), Z17
// Load and process 64 bytes from input 0 to 6 outputs
VMOVDQU64 (DX), Z18
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
VXORPD Z12, Z19, Z12
VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
VXORPD Z13, Z19, Z13
VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
VXORPD Z14, Z19, Z14
VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
VXORPD Z15, Z19, Z15
VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (CX), Z18
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
VXORPD Z12, Z19, Z12
VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
VXORPD Z13, Z19, Z13
VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
VXORPD Z14, Z19, Z14
VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
VXORPD Z15, Z19, Z15
VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
VXORPD Z17, Z19, Z17
// Store 6 outputs
VMOVDQU64 Z12, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z13, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z14, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z15, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z16, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z17, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x6_64Xor_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulGFNI_2x6_64Xor_end:
RET
// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating form: combines 2 input slices into 6 output slices, 32 bytes
// per loop iteration, XORing into what the outputs already hold. For each
// output j in 0..5:
//   out[j] = out[j] XOR affine(matrix[j], in[0]) XOR affine(matrix[6+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>5 times: a trailing n&31 bytes are not processed, and for
// n < 32 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Only matrix[0..7] stay resident (Y0-Y7); matrix[8..11] are re-broadcast
// from memory into Y15 on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// Registers: AX = iterations left; CX = matrix base; BX, DX = in[0], in[1]
// cursors; DI, R8, R9, R10, R11, SI = out[0..5] cursors; R12 = start;
// Y8-Y13 = accumulators; Y14 = current input chunk; Y15 = scratch.
TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88
// Loading 8 of 12 tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks
SHRQ $0x05, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulAvxGFNI_2x6Xor_end
// Broadcast the first 8 matrix entries to every 64-bit lane of Y0-Y7
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
MOVQ out_base+48(FP), SI
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
// Last load overwrites SI, the header base, with the out[5] data pointer
MOVQ 120(SI), SI
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, SI
// Add start offset to input
ADDQ R12, BX
ADDQ R12, DX
mulAvxGFNI_2x6Xor_loop:
// Load 6 outputs
// (existing output bytes seed the accumulators)
VMOVDQU (DI), Y8
VMOVDQU (R8), Y9
VMOVDQU (R9), Y10
VMOVDQU (R10), Y11
VMOVDQU (R11), Y12
VMOVDQU (SI), Y13
// Load and process 32 bytes from input 0 to 6 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// Remaining tables do not fit in registers: reload each into Y15, which is
// then overwritten by its own product
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (DI)
ADDQ $0x20, DI
VMOVDQU Y9, (R8)
ADDQ $0x20, R8
VMOVDQU Y10, (R9)
ADDQ $0x20, R9
VMOVDQU Y11, (R10)
ADDQ $0x20, R10
VMOVDQU Y12, (R11)
ADDQ $0x20, R11
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x6Xor_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulAvxGFNI_2x6Xor_end:
RET
// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 2 input slices into 7 output slices, 64 bytes per loop iteration.
// For each output j in 0..6 the 64-byte chunk written is
//   out[j] = affine(matrix[j], in[0]) XOR affine(matrix[7+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant. Previous output contents are overwritten.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>6 times: a trailing n&63 bytes are not processed, and for
// n < 64 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Registers: AX = iterations left; DX, CX = in[0], in[1] cursors;
// SI, DI, R8, R9, R10, R11, BX = out[0..6] cursors; R12 = start;
// Z0-Z6 = matrix[0..6], Z7-Z13 = matrix[7..13]; Z14-Z20 = accumulators;
// Z21 = current input chunk; Z22 = scratch.
TEXT ·mulGFNI_2x7_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 23 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks
SHRQ $0x06, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulGFNI_2x7_64_end
// Broadcast each 8-byte matrix entry to every 64-bit lane of a ZMM register
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
// CX is free now that all tables are in registers; reuse it for the inputs.
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
MOVQ out_base+48(FP), BX
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
// Last load overwrites BX, the header base, with the out[6] data pointer
MOVQ 144(BX), BX
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, BX
// Add start offset to input
ADDQ R12, DX
ADDQ R12, CX
mulGFNI_2x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// (first input initialises the accumulators directly, no XOR needed)
VMOVDQU64 (DX), Z21
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z21, Z14
VGF2P8AFFINEQB $0x00, Z1, Z21, Z15
VGF2P8AFFINEQB $0x00, Z2, Z21, Z16
VGF2P8AFFINEQB $0x00, Z3, Z21, Z17
VGF2P8AFFINEQB $0x00, Z4, Z21, Z18
VGF2P8AFFINEQB $0x00, Z5, Z21, Z19
VGF2P8AFFINEQB $0x00, Z6, Z21, Z20
// Load and process 64 bytes from input 1 to 7 outputs
// (each product goes to scratch Z22 and is XORed into its accumulator)
VMOVDQU64 (CX), Z21
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
VXORPD Z14, Z22, Z14
VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
VXORPD Z15, Z22, Z15
VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
VXORPD Z16, Z22, Z16
VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
VXORPD Z17, Z22, Z17
VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
VXORPD Z18, Z22, Z18
VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
VXORPD Z19, Z22, Z19
VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
VXORPD Z20, Z22, Z20
// Store 7 outputs
VMOVDQU64 Z14, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z15, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z16, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z17, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z18, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z19, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z20, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x7_64_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulGFNI_2x7_64_end:
RET
// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 7 output slices, 32 bytes per loop iteration.
// For each output j in 0..6 the 32-byte chunk written is
//   out[j] = affine(matrix[j], in[0]) XOR affine(matrix[7+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant. Previous output contents are overwritten.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>5 times: a trailing n&31 bytes are not processed, and for
// n < 32 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Only matrix[0..6] stay resident (Y0-Y6); matrix[7..13] are re-broadcast
// from memory into Y15 on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// Registers: AX = iterations left; CX = matrix base; BX, DX = in[0], in[1]
// cursors; DI, R8, R9, R10, R11, R12, SI = out[0..6] cursors; R13 = start;
// Y7-Y13 = accumulators; Y14 = current input chunk; Y15 = scratch.
TEXT ·mulAvxGFNI_2x7(SB), $0-88
// Loading 7 of 14 tables to registers
// Destination kept in GP registers
// Full registers estimated 23 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks
SHRQ $0x05, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulAvxGFNI_2x7_end
// Broadcast the first 7 matrix entries to every 64-bit lane of Y0-Y6
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
MOVQ out_base+48(FP), SI
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
// Last load overwrites SI, the header base, with the out[6] data pointer
MOVQ 144(SI), SI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, SI
// Add start offset to input
ADDQ R13, BX
ADDQ R13, DX
mulAvxGFNI_2x7_loop:
// Load and process 32 bytes from input 0 to 7 outputs
// (first input initialises the accumulators directly, no XOR needed)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// (tables for input 1 are not resident: each is reloaded into Y15, which is
// then overwritten by its own product)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
VMOVDQU Y7, (DI)
ADDQ $0x20, DI
VMOVDQU Y8, (R8)
ADDQ $0x20, R8
VMOVDQU Y9, (R9)
ADDQ $0x20, R9
VMOVDQU Y10, (R10)
ADDQ $0x20, R10
VMOVDQU Y11, (R11)
ADDQ $0x20, R11
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x7_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulAvxGFNI_2x7_end:
RET
// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating form: combines 2 input slices into 7 output slices, 64 bytes
// per loop iteration, XORing into what the outputs already hold. For each
// output j in 0..6:
//   out[j] = out[j] XOR affine(matrix[j], in[0]) XOR affine(matrix[7+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>6 times: a trailing n&63 bytes are not processed, and for
// n < 64 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Registers: AX = iterations left; DX, CX = in[0], in[1] cursors;
// SI, DI, R8, R9, R10, R11, BX = out[0..6] cursors; R12 = start;
// Z0-Z6 = matrix[0..6], Z7-Z13 = matrix[7..13]; Z14-Z20 = accumulators;
// Z21 = current input chunk; Z22 = scratch.
TEXT ·mulGFNI_2x7_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 23 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks
SHRQ $0x06, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulGFNI_2x7_64Xor_end
// Broadcast each 8-byte matrix entry to every 64-bit lane of a ZMM register
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
// CX is free now that all tables are in registers; reuse it for the inputs.
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
MOVQ out_base+48(FP), BX
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
// Last load overwrites BX, the header base, with the out[6] data pointer
MOVQ 144(BX), BX
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, BX
// Add start offset to input
ADDQ R12, DX
ADDQ R12, CX
mulGFNI_2x7_64Xor_loop:
// Load 7 outputs
// (existing output bytes seed the accumulators)
VMOVDQU64 (SI), Z14
VMOVDQU64 (DI), Z15
VMOVDQU64 (R8), Z16
VMOVDQU64 (R9), Z17
VMOVDQU64 (R10), Z18
VMOVDQU64 (R11), Z19
VMOVDQU64 (BX), Z20
// Load and process 64 bytes from input 0 to 7 outputs
VMOVDQU64 (DX), Z21
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
VXORPD Z14, Z22, Z14
VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
VXORPD Z15, Z22, Z15
VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
VXORPD Z16, Z22, Z16
VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
VXORPD Z17, Z22, Z17
VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
VXORPD Z18, Z22, Z18
VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
VXORPD Z19, Z22, Z19
VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
VXORPD Z20, Z22, Z20
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (CX), Z21
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
VXORPD Z14, Z22, Z14
VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
VXORPD Z15, Z22, Z15
VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
VXORPD Z16, Z22, Z16
VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
VXORPD Z17, Z22, Z17
VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
VXORPD Z18, Z22, Z18
VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
VXORPD Z19, Z22, Z19
VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
VXORPD Z20, Z22, Z20
// Store 7 outputs
VMOVDQU64 Z14, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z15, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z16, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z17, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z18, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z19, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z20, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x7_64Xor_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulGFNI_2x7_64Xor_end:
RET
// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating form: combines 2 input slices into 7 output slices, 32 bytes
// per loop iteration, XORing into what the outputs already hold. For each
// output j in 0..6:
//   out[j] = out[j] XOR affine(matrix[j], in[0]) XOR affine(matrix[7+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>5 times: a trailing n&31 bytes are not processed, and for
// n < 32 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Only matrix[0..6] stay resident (Y0-Y6); matrix[7..13] are re-broadcast
// from memory into Y15 on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// Registers: AX = iterations left; CX = matrix base; BX, DX = in[0], in[1]
// cursors; DI, R8, R9, R10, R11, R12, SI = out[0..6] cursors; R13 = start;
// Y7-Y13 = accumulators; Y14 = current input chunk; Y15 = scratch.
TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88
// Loading 7 of 14 tables to registers
// Destination kept in GP registers
// Full registers estimated 23 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks
SHRQ $0x05, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulAvxGFNI_2x7Xor_end
// Broadcast the first 7 matrix entries to every 64-bit lane of Y0-Y6
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
MOVQ out_base+48(FP), SI
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
// Last load overwrites SI, the header base, with the out[6] data pointer
MOVQ 144(SI), SI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, SI
// Add start offset to input
ADDQ R13, BX
ADDQ R13, DX
mulAvxGFNI_2x7Xor_loop:
// Load 7 outputs
// (existing output bytes seed the accumulators)
VMOVDQU (DI), Y7
VMOVDQU (R8), Y8
VMOVDQU (R9), Y9
VMOVDQU (R10), Y10
VMOVDQU (R11), Y11
VMOVDQU (R12), Y12
VMOVDQU (SI), Y13
// Load and process 32 bytes from input 0 to 7 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// (tables for input 1 are not resident: each is reloaded into Y15, which is
// then overwritten by its own product)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
VMOVDQU Y7, (DI)
ADDQ $0x20, DI
VMOVDQU Y8, (R8)
ADDQ $0x20, R8
VMOVDQU Y9, (R9)
ADDQ $0x20, R9
VMOVDQU Y10, (R10)
ADDQ $0x20, R10
VMOVDQU Y11, (R11)
ADDQ $0x20, R11
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x7Xor_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulAvxGFNI_2x7Xor_end:
RET
// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 2 input slices into 8 output slices, 64 bytes per loop iteration.
// For each output j in 0..7 the 64-byte chunk written is
//   out[j] = affine(matrix[j], in[0]) XOR affine(matrix[8+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant. Previous output contents are overwritten.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>6 times: a trailing n&63 bytes are not processed, and for
// n < 64 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Registers: AX = iterations left; DX, CX = in[0], in[1] cursors;
// SI, DI, R8, R9, R10, R11, R12, BX = out[0..7] cursors; R13 = start;
// Z0-Z7 = matrix[0..7], Z8-Z15 = matrix[8..15]; Z16-Z23 = accumulators;
// Z24 = current input chunk; Z25 = scratch.
TEXT ·mulGFNI_2x8_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks
SHRQ $0x06, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulGFNI_2x8_64_end
// Broadcast each 8-byte matrix entry to every 64-bit lane of a ZMM register
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
// CX is free now that all tables are in registers; reuse it for the inputs.
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
MOVQ out_base+48(FP), BX
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
MOVQ 144(BX), R12
// Last load overwrites BX, the header base, with the out[7] data pointer
MOVQ 168(BX), BX
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, BX
// Add start offset to input
ADDQ R13, DX
ADDQ R13, CX
mulGFNI_2x8_64_loop:
// Load and process 64 bytes from input 0 to 8 outputs
// (first input initialises the accumulators directly, no XOR needed)
VMOVDQU64 (DX), Z24
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z24, Z16
VGF2P8AFFINEQB $0x00, Z1, Z24, Z17
VGF2P8AFFINEQB $0x00, Z2, Z24, Z18
VGF2P8AFFINEQB $0x00, Z3, Z24, Z19
VGF2P8AFFINEQB $0x00, Z4, Z24, Z20
VGF2P8AFFINEQB $0x00, Z5, Z24, Z21
VGF2P8AFFINEQB $0x00, Z6, Z24, Z22
VGF2P8AFFINEQB $0x00, Z7, Z24, Z23
// Load and process 64 bytes from input 1 to 8 outputs
// (each product goes to scratch Z25 and is XORed into its accumulator)
VMOVDQU64 (CX), Z24
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
VXORPD Z16, Z25, Z16
VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
VXORPD Z17, Z25, Z17
VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
VXORPD Z18, Z25, Z18
VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
VXORPD Z19, Z25, Z19
VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
VXORPD Z20, Z25, Z20
VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
VXORPD Z23, Z25, Z23
// Store 8 outputs
VMOVDQU64 Z16, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z17, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z18, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z19, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z20, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z21, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z22, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z23, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x8_64_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulGFNI_2x8_64_end:
RET
// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 8 output slices, 32 bytes per loop iteration.
// For each output j in 0..7 the 32-byte chunk written is
//   out[j] = affine(matrix[j], in[0]) XOR affine(matrix[8+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant. Previous output contents are overwritten.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>5 times: a trailing n&31 bytes are not processed, and for
// n < 32 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Only matrix[0..5] stay resident (Y0-Y5); matrix[6..15] are re-broadcast
// from memory on every iteration, so CX must keep pointing at the matrix
// for the whole loop.
//
// Registers: AX = iterations left; CX = matrix base; BX, DX = in[0], in[1]
// cursors; DI, R8, R9, R10, R11, R12, R13, SI = out[0..7] cursors;
// R14 = start; Y6-Y13 = accumulators; Y14 = current input chunk;
// Y15 = scratch.
TEXT ·mulAvxGFNI_2x8(SB), $0-88
// Loading 6 of 16 tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks
SHRQ $0x05, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulAvxGFNI_2x8_end
// Broadcast the first 6 matrix entries to every 64-bit lane of Y0-Y5
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
MOVQ out_base+48(FP), SI
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
// Last load overwrites SI, the header base, with the out[7] data pointer
MOVQ 168(SI), SI
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, SI
// Add start offset to input
ADDQ R14, BX
ADDQ R14, DX
mulAvxGFNI_2x8_loop:
// Load and process 32 bytes from input 0 to 8 outputs
// (first input initialises the accumulators directly, no XOR needed)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
// matrix[6] and matrix[7] are not resident: each is broadcast straight into
// the accumulator it is about to initialise
VBROADCASTSD 48(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 56(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 8 outputs
// (each table is reloaded into Y15, which is then overwritten by its own
// product)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
VMOVDQU Y6, (DI)
ADDQ $0x20, DI
VMOVDQU Y7, (R8)
ADDQ $0x20, R8
VMOVDQU Y8, (R9)
ADDQ $0x20, R9
VMOVDQU Y9, (R10)
ADDQ $0x20, R10
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x8_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulAvxGFNI_2x8_end:
RET
// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating form: combines 2 input slices into 8 output slices, 64 bytes
// per loop iteration, XORing into what the outputs already hold. For each
// output j in 0..7:
//   out[j] = out[j] XOR affine(matrix[j], in[0]) XOR affine(matrix[8+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>6 times: a trailing n&63 bytes are not processed, and for
// n < 64 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Registers: AX = iterations left; DX, CX = in[0], in[1] cursors;
// SI, DI, R8, R9, R10, R11, R12, BX = out[0..7] cursors; R13 = start;
// Z0-Z7 = matrix[0..7], Z8-Z15 = matrix[8..15]; Z16-Z23 = accumulators;
// Z24 = current input chunk; Z25 = scratch.
TEXT ·mulGFNI_2x8_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks
SHRQ $0x06, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulGFNI_2x8_64Xor_end
// Broadcast each 8-byte matrix entry to every 64-bit lane of a ZMM register
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
// CX is free now that all tables are in registers; reuse it for the inputs.
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
MOVQ out_base+48(FP), BX
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
MOVQ 144(BX), R12
// Last load overwrites BX, the header base, with the out[7] data pointer
MOVQ 168(BX), BX
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, BX
// Add start offset to input
ADDQ R13, DX
ADDQ R13, CX
mulGFNI_2x8_64Xor_loop:
// Load 8 outputs
// (existing output bytes seed the accumulators)
VMOVDQU64 (SI), Z16
VMOVDQU64 (DI), Z17
VMOVDQU64 (R8), Z18
VMOVDQU64 (R9), Z19
VMOVDQU64 (R10), Z20
VMOVDQU64 (R11), Z21
VMOVDQU64 (R12), Z22
VMOVDQU64 (BX), Z23
// Load and process 64 bytes from input 0 to 8 outputs
VMOVDQU64 (DX), Z24
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
VXORPD Z16, Z25, Z16
VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
VXORPD Z17, Z25, Z17
VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
VXORPD Z18, Z25, Z18
VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
VXORPD Z19, Z25, Z19
VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
VXORPD Z20, Z25, Z20
VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 1 to 8 outputs
VMOVDQU64 (CX), Z24
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
VXORPD Z16, Z25, Z16
VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
VXORPD Z17, Z25, Z17
VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
VXORPD Z18, Z25, Z18
VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
VXORPD Z19, Z25, Z19
VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
VXORPD Z20, Z25, Z20
VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
VXORPD Z23, Z25, Z23
// Store 8 outputs
VMOVDQU64 Z16, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z17, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z18, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z19, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z20, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z21, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z22, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z23, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x8_64Xor_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulGFNI_2x8_64Xor_end:
RET
// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating form: combines 2 input slices into 8 output slices, 32 bytes
// per loop iteration, XORing into what the outputs already hold. For each
// output j in 0..7:
//   out[j] = out[j] XOR affine(matrix[j], in[0]) XOR affine(matrix[8+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>5 times: a trailing n&31 bytes are not processed, and for
// n < 32 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Only matrix[0..5] stay resident (Y0-Y5); matrix[6..15] are re-broadcast
// from memory into Y15 on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// Registers: AX = iterations left; CX = matrix base; BX, DX = in[0], in[1]
// cursors; DI, R8, R9, R10, R11, R12, R13, SI = out[0..7] cursors;
// R14 = start; Y6-Y13 = accumulators; Y14 = current input chunk;
// Y15 = scratch.
TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88
// Loading 6 of 16 tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks
SHRQ $0x05, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulAvxGFNI_2x8Xor_end
// Broadcast the first 6 matrix entries to every 64-bit lane of Y0-Y5
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
MOVQ out_base+48(FP), SI
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
// Last load overwrites SI, the header base, with the out[7] data pointer
MOVQ 168(SI), SI
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, SI
// Add start offset to input
ADDQ R14, BX
ADDQ R14, DX
mulAvxGFNI_2x8Xor_loop:
// Load 8 outputs
// (existing output bytes seed the accumulators)
VMOVDQU (DI), Y6
VMOVDQU (R8), Y7
VMOVDQU (R9), Y8
VMOVDQU (R10), Y9
VMOVDQU (R11), Y10
VMOVDQU (R12), Y11
VMOVDQU (R13), Y12
VMOVDQU (SI), Y13
// Load and process 32 bytes from input 0 to 8 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
// matrix[6] and matrix[7] are not resident: reload each into Y15, which is
// then overwritten by its own product
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
VMOVDQU Y6, (DI)
ADDQ $0x20, DI
VMOVDQU Y7, (R8)
ADDQ $0x20, R8
VMOVDQU Y8, (R9)
ADDQ $0x20, R9
VMOVDQU Y9, (R10)
ADDQ $0x20, R10
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x8Xor_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulAvxGFNI_2x8Xor_end:
RET
// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 2 input slices into 9 output slices, 64 bytes per loop iteration.
// For each output j in 0..8 the 64-byte chunk written is
//   out[j] = affine(matrix[j], in[0]) XOR affine(matrix[9+j], in[1])
// where affine(m, x) is VGF2P8AFFINEQB with bit-matrix m, data bytes x and a
// zero additive constant. Previous output contents are overwritten.
// NOTE(review): presumably each matrix entry encodes a GF(2^8) constant
// multiplication prepared by the Go caller - confirm on the Go side.
//
// The loop runs n>>6 times: a trailing n&63 bytes are not processed, and for
// n < 64 the function returns before reading the matrix or any slice. start
// is added to every input and output data pointer. No slice length checks
// are performed here.
//
// Registers: AX = iterations left; DX, CX = in[0], in[1] cursors;
// SI, DI, R8, R9, R10, R11, R12, R13, BX = out[0..8] cursors; R14 = start;
// Z0-Z8 = matrix[0..8], Z9-Z17 = matrix[9..17]; Z18-Z26 = accumulators;
// Z27 = current input chunk; Z28 = scratch.
TEXT ·mulGFNI_2x9_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 29 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks
SHRQ $0x06, AX
// NOTE(review): SHRQ with a nonzero immediate count already sets ZF from its
// result, so the TESTQ below looks redundant (harmless).
TESTQ AX, AX
JZ mulGFNI_2x9_64_end
// Broadcast each 8-byte matrix entry to every 64-bit lane of a ZMM register
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
// CX is free now that all tables are in registers; reuse it for the inputs.
// Slice headers are 24 bytes apart; offset 0 of each header is the data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
MOVQ out_base+48(FP), BX
// NOTE(review): the next load repeats the previous instruction (same source,
// same destination); redundant but harmless.
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
MOVQ 144(BX), R12
MOVQ 168(BX), R13
// Last load overwrites BX, the header base, with the out[8] data pointer
MOVQ 192(BX), BX
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, BX
// Add start offset to input
ADDQ R14, DX
ADDQ R14, CX
mulGFNI_2x9_64_loop:
// Load and process 64 bytes from input 0 to 9 outputs
// (first input initialises the accumulators directly, no XOR needed)
VMOVDQU64 (DX), Z27
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z27, Z18
VGF2P8AFFINEQB $0x00, Z1, Z27, Z19
VGF2P8AFFINEQB $0x00, Z2, Z27, Z20
VGF2P8AFFINEQB $0x00, Z3, Z27, Z21
VGF2P8AFFINEQB $0x00, Z4, Z27, Z22
VGF2P8AFFINEQB $0x00, Z5, Z27, Z23
VGF2P8AFFINEQB $0x00, Z6, Z27, Z24
VGF2P8AFFINEQB $0x00, Z7, Z27, Z25
VGF2P8AFFINEQB $0x00, Z8, Z27, Z26
// Load and process 64 bytes from input 1 to 9 outputs
// (each product goes to scratch Z28 and is XORed into its accumulator)
VMOVDQU64 (CX), Z27
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
VXORPD Z18, Z28, Z18
VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
VXORPD Z19, Z28, Z19
VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
VXORPD Z20, Z28, Z20
VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
VXORPD Z21, Z28, Z21
VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
VXORPD Z22, Z28, Z22
VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
VXORPD Z23, Z28, Z23
VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
VXORPD Z24, Z28, Z24
VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
VXORPD Z25, Z28, Z25
VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
VXORPD Z26, Z28, Z26
// Store 9 outputs
VMOVDQU64 Z18, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z19, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z20, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z21, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z22, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z23, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z24, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z25, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z26, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x9_64_loop
// Only reached after vector work; the early-exit path jumps past this
VZEROUPPER
mulGFNI_2x9_64_end:
RET
// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 9 output slices, 32 bytes per iteration:
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[9+j], in[1])   for j = 0..8
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Previous output contents are
// overwritten. Only n>>5 whole 32-byte blocks are processed; the function
// returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, CX = matrix base, R15 = start offset
//   BX, DX = in[0], in[1] cursors
//   DI, R8-R14, SI = out[0..8] cursors
//   Y0-Y4 = resident matrix entries 0..4, Y5-Y13 = output accumulators
//   Y14 = current input block, Y15 = scratch
TEXT ·mulAvxGFNI_2x9(SB), $0-88
// Loading 5 of 18 tables to registers
// Destination kept in GP registers
// Full registers estimated 29 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_2x9_end
// Matrix entries 0..4 stay resident; the remaining 13 are re-broadcast in the loop
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
// Data pointers of out[0..8]; SI holds the header base until it is replaced by out[8]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), SI
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, SI
// Add start offset to input
ADDQ R15, BX
ADDQ R15, DX
mulAvxGFNI_2x9_loop:
// Load and process 32 bytes from input 0 to 9 outputs
// (first input initialises the accumulators, so no XOR is needed here)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
// Non-resident entries: broadcast into the accumulator register, then transform in place
VBROADCASTSD 40(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 48(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 56(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 64(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 9 outputs
// (matrix entries 9..17 go through scratch Y15 and are XORed into the accumulators)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
VMOVDQU Y5, (DI)
ADDQ $0x20, DI
VMOVDQU Y6, (R8)
ADDQ $0x20, R8
VMOVDQU Y7, (R9)
ADDQ $0x20, R9
VMOVDQU Y8, (R10)
ADDQ $0x20, R10
VMOVDQU Y9, (R11)
ADDQ $0x20, R11
VMOVDQU Y10, (R12)
ADDQ $0x20, R12
VMOVDQU Y11, (R13)
ADDQ $0x20, R13
VMOVDQU Y12, (R14)
ADDQ $0x20, R14
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x9_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_2x9_end:
RET
// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 2 input slices into 9 output slices, 64 bytes per iteration,
// accumulating onto the existing output contents:
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[9+j], in[1])   for j = 0..8
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Only n>>6 whole 64-byte blocks
// are processed; the function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, R14 = start offset
//   CX = matrix base during setup, then in[1] cursor; DX = in[0] cursor
//   SI, DI, R8-R13, BX = out[0..8] cursors
//   Z0-Z17 = matrix entries, Z18-Z26 = output accumulators
//   Z27 = current input block, Z28 = scratch
TEXT ·mulGFNI_2x9_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 29 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_2x9_64Xor_end
// Broadcast all 18 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
// Data pointers of out[0..8]; BX holds the header base until it is replaced by out[8]
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
MOVQ 144(BX), R12
MOVQ 168(BX), R13
MOVQ 192(BX), BX
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, BX
// Add start offset to input
ADDQ R14, DX
ADDQ R14, CX
mulGFNI_2x9_64Xor_loop:
// Load 9 outputs
// (existing output bytes seed the accumulators)
VMOVDQU64 (SI), Z18
VMOVDQU64 (DI), Z19
VMOVDQU64 (R8), Z20
VMOVDQU64 (R9), Z21
VMOVDQU64 (R10), Z22
VMOVDQU64 (R11), Z23
VMOVDQU64 (R12), Z24
VMOVDQU64 (R13), Z25
VMOVDQU64 (BX), Z26
// Load and process 64 bytes from input 0 to 9 outputs
VMOVDQU64 (DX), Z27
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
VXORPD Z18, Z28, Z18
VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
VXORPD Z19, Z28, Z19
VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
VXORPD Z20, Z28, Z20
VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
VXORPD Z21, Z28, Z21
VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
VXORPD Z22, Z28, Z22
VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
VXORPD Z23, Z28, Z23
VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
VXORPD Z24, Z28, Z24
VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
VXORPD Z25, Z28, Z25
VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
VXORPD Z26, Z28, Z26
// Load and process 64 bytes from input 1 to 9 outputs
VMOVDQU64 (CX), Z27
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
VXORPD Z18, Z28, Z18
VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
VXORPD Z19, Z28, Z19
VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
VXORPD Z20, Z28, Z20
VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
VXORPD Z21, Z28, Z21
VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
VXORPD Z22, Z28, Z22
VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
VXORPD Z23, Z28, Z23
VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
VXORPD Z24, Z28, Z24
VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
VXORPD Z25, Z28, Z25
VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
VXORPD Z26, Z28, Z26
// Store 9 outputs
VMOVDQU64 Z18, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z19, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z20, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z21, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z22, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z23, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z24, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z25, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z26, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x9_64Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_2x9_64Xor_end:
RET
// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 9 output slices, 32 bytes per iteration,
// accumulating onto the existing output contents:
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[9+j], in[1])   for j = 0..8
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Only n>>5 whole 32-byte blocks
// are processed; the function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, CX = matrix base, R15 = start offset
//   BX, DX = in[0], in[1] cursors
//   DI, R8-R14, SI = out[0..8] cursors
//   Y0-Y4 = resident matrix entries 0..4, Y5-Y13 = output accumulators
//   Y14 = current input block, Y15 = scratch
TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88
// Loading 5 of 18 tables to registers
// Destination kept in GP registers
// Full registers estimated 29 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_2x9Xor_end
// Matrix entries 0..4 stay resident; the remaining 13 are re-broadcast in the loop
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
// Data pointers of out[0..8]; SI holds the header base until it is replaced by out[8]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), SI
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, SI
// Add start offset to input
ADDQ R15, BX
ADDQ R15, DX
mulAvxGFNI_2x9Xor_loop:
// Load 9 outputs
// (existing output bytes seed the accumulators)
VMOVDQU (DI), Y5
VMOVDQU (R8), Y6
VMOVDQU (R9), Y7
VMOVDQU (R10), Y8
VMOVDQU (R11), Y9
VMOVDQU (R12), Y10
VMOVDQU (R13), Y11
VMOVDQU (R14), Y12
VMOVDQU (SI), Y13
// Load and process 32 bytes from input 0 to 9 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
// Non-resident entries are broadcast into scratch Y15 before use
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
VMOVDQU Y5, (DI)
ADDQ $0x20, DI
VMOVDQU Y6, (R8)
ADDQ $0x20, R8
VMOVDQU Y7, (R9)
ADDQ $0x20, R9
VMOVDQU Y8, (R10)
ADDQ $0x20, R10
VMOVDQU Y9, (R11)
ADDQ $0x20, R11
VMOVDQU Y10, (R12)
ADDQ $0x20, R12
VMOVDQU Y11, (R13)
ADDQ $0x20, R13
VMOVDQU Y12, (R14)
ADDQ $0x20, R14
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x9Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_2x9Xor_end:
RET
// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 2 input slices into 10 output slices, 64 bytes per iteration:
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[10+j], in[1])   for j = 0..9
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Previous output contents are
// overwritten. Only n>>6 whole 64-byte blocks are processed; the function
// returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, R15 = start offset
//   CX = matrix base during setup, then in[1] cursor; DX = in[0] cursor
//   SI, DI, R8-R14, BX = out[0..9] cursors
//   Z0-Z19 = matrix entries, Z20-Z29 = output accumulators
//   Z30 = current input block, Z31 = scratch
TEXT ·mulGFNI_2x10_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_2x10_64_end
// Broadcast all 20 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
// Data pointers of out[0..9]; BX holds the header base until it is replaced by out[9]
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
MOVQ 144(BX), R12
MOVQ 168(BX), R13
MOVQ 192(BX), R14
MOVQ 216(BX), BX
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, BX
// Add start offset to input
ADDQ R15, DX
ADDQ R15, CX
mulGFNI_2x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// (first input initialises the accumulators, so no XOR is needed here)
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (CX), Z30
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
VMOVDQU64 Z20, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z21, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z22, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x10_64_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_2x10_64_end:
RET
// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 10 output slices, 32 bytes per iteration:
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[10+j], in[1])   for j = 0..9
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Previous output contents are
// overwritten. Only n>>5 whole 32-byte blocks are processed; the function
// returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, CX = matrix base, BP = start offset
//   BX, DX = in[0], in[1] cursors
//   DI, R8-R15, SI = out[0..9] cursors
//   Y0-Y3 = resident matrix entries 0..3, Y4-Y13 = output accumulators
//   Y14 = current input block, Y15 = scratch
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as a scratch register below — confirm.
TEXT ·mulAvxGFNI_2x10(SB), $8-88
// Loading 4 of 20 tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_2x10_end
// Matrix entries 0..3 stay resident; the remaining 16 are re-broadcast in the loop
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
// Data pointers of out[0..9]; SI holds the header base until it is replaced by out[9]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), R15
MOVQ 216(SI), SI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, SI
// Add start offset to input
ADDQ BP, BX
ADDQ BP, DX
mulAvxGFNI_2x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// (first input initialises the accumulators, so no XOR is needed here)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
// Non-resident entries: broadcast into the accumulator register, then transform in place
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
// (matrix entries 10..19 go through scratch Y15 and are XORed into the accumulators)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
VMOVDQU Y4, (DI)
ADDQ $0x20, DI
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x10_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_2x10_end:
RET
// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 2 input slices into 10 output slices, 64 bytes per iteration,
// accumulating onto the existing output contents:
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[10+j], in[1])   for j = 0..9
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Only n>>6 whole 64-byte blocks
// are processed; the function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, R15 = start offset
//   CX = matrix base during setup, then in[1] cursor; DX = in[0] cursor
//   SI, DI, R8-R14, BX = out[0..9] cursors
//   Z0-Z19 = matrix entries, Z20-Z29 = output accumulators
//   Z30 = current input block, Z31 = scratch
TEXT ·mulGFNI_2x10_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_2x10_64Xor_end
// Broadcast all 20 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), CX
// Data pointers of out[0..9]; BX holds the header base until it is replaced by out[9]
MOVQ out_base+48(FP), BX
MOVQ (BX), SI
MOVQ 24(BX), DI
MOVQ 48(BX), R8
MOVQ 72(BX), R9
MOVQ 96(BX), R10
MOVQ 120(BX), R11
MOVQ 144(BX), R12
MOVQ 168(BX), R13
MOVQ 192(BX), R14
MOVQ 216(BX), BX
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, BX
// Add start offset to input
ADDQ R15, DX
ADDQ R15, CX
mulGFNI_2x10_64Xor_loop:
// Load 10 outputs
// (existing output bytes seed the accumulators)
VMOVDQU64 (SI), Z20
VMOVDQU64 (DI), Z21
VMOVDQU64 (R8), Z22
VMOVDQU64 (R9), Z23
VMOVDQU64 (R10), Z24
VMOVDQU64 (R11), Z25
VMOVDQU64 (R12), Z26
VMOVDQU64 (R13), Z27
VMOVDQU64 (R14), Z28
VMOVDQU64 (BX), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (CX), Z30
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
VMOVDQU64 Z20, (SI)
ADDQ $0x40, SI
VMOVDQU64 Z21, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z22, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (BX)
ADDQ $0x40, BX
// Prepare for next loop
DECQ AX
JNZ mulGFNI_2x10_64Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_2x10_64Xor_end:
RET
// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 2 input slices into 10 output slices, 32 bytes per iteration,
// accumulating onto the existing output contents:
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[10+j], in[1])   for j = 0..9
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Only n>>5 whole 32-byte blocks
// are processed; the function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, CX = matrix base, BP = start offset
//   BX, DX = in[0], in[1] cursors
//   DI, R8-R15, SI = out[0..9] cursors
//   Y0-Y3 = resident matrix entries 0..3, Y4-Y13 = output accumulators
//   Y14 = current input block, Y15 = scratch
//
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP, which is used as a scratch register below — confirm.
TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88
// Loading 4 of 20 tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_2x10Xor_end
// Matrix entries 0..3 stay resident; the remaining 16 are re-broadcast in the loop
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Data pointers of in[0] and in[1] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), DX
// Data pointers of out[0..9]; SI holds the header base until it is replaced by out[9]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), R15
MOVQ 216(SI), SI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, SI
// Add start offset to input
ADDQ BP, BX
ADDQ BP, DX
mulAvxGFNI_2x10Xor_loop:
// Load 10 outputs
// (existing output bytes seed the accumulators)
VMOVDQU (DI), Y4
VMOVDQU (R8), Y5
VMOVDQU (R9), Y6
VMOVDQU (R10), Y7
VMOVDQU (R11), Y8
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (SI), Y13
// Load and process 32 bytes from input 0 to 10 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y4, Y15, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y7, Y15, Y7
// Non-resident entries are broadcast into scratch Y15 before use
VBROADCASTSD 32(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
VMOVDQU Y4, (DI)
ADDQ $0x20, DI
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_2x10Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_2x10Xor_end:
RET
// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 3 input slices into 1 output slice, 64 bytes per iteration:
//   out[0] = affine(matrix[0], in[0]) ^ affine(matrix[1], in[1]) ^ affine(matrix[2], in[2])
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Previous output contents are
// overwritten. Only n>>6 whole 64-byte blocks are processed; the function
// returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, DI = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   SI = out[0] cursor
//   Z0-Z2 = matrix entries, Z3 = output accumulator, Z4 = input block / scratch
TEXT ·mulGFNI_3x1_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 6 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x1_64_end
// Broadcast all 3 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointer of out[0]; SI first holds the header base
MOVQ out_base+48(FP), SI
MOVQ (SI), SI
MOVQ start+72(FP), DI
// Add start offset to output
ADDQ DI, SI
// Add start offset to input
ADDQ DI, DX
ADDQ DI, BX
ADDQ DI, CX
mulGFNI_3x1_64_loop:
// Load and process 64 bytes from input 0 to 1 outputs
// (first input initialises the accumulator, so no XOR is needed here)
VMOVDQU64 (DX), Z4
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z4, Z3
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z4
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
VXORPD Z3, Z4, Z3
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (CX), Z4
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
VXORPD Z3, Z4, Z3
// Store 1 outputs
VMOVDQU64 Z3, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x1_64_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_3x1_64_end:
RET
// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 3 input slices into 1 output slice, 32 bytes per iteration:
//   out[0] = affine(matrix[0], in[0]) ^ affine(matrix[1], in[1]) ^ affine(matrix[2], in[2])
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Previous output contents are
// overwritten. Only n>>5 whole 32-byte blocks are processed; the function
// returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, DI = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   SI = out[0] cursor
//   Y0-Y2 = matrix entries, Y3 = output accumulator, Y4 = input block / scratch
TEXT ·mulAvxGFNI_3x1(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 6 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x1_end
// Broadcast all 3 matrix entries; CX is free for reuse afterwards
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointer of out[0]; SI first holds the header base
MOVQ out_base+48(FP), SI
MOVQ (SI), SI
MOVQ start+72(FP), DI
// Add start offset to output
ADDQ DI, SI
// Add start offset to input
ADDQ DI, DX
ADDQ DI, BX
ADDQ DI, CX
mulAvxGFNI_3x1_loop:
// Load and process 32 bytes from input 0 to 1 outputs
// (first input initialises the accumulator, so no XOR is needed here)
VMOVDQU (DX), Y4
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y4, Y3
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y4
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
VXORPD Y3, Y4, Y3
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (CX), Y4
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
VXORPD Y3, Y4, Y3
// Store 1 outputs
VMOVDQU Y3, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x1_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_3x1_end:
RET
// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 3 input slices into 1 output slice, 64 bytes per iteration,
// accumulating onto the existing output contents:
//   out[0] ^= affine(matrix[0], in[0]) ^ affine(matrix[1], in[1]) ^ affine(matrix[2], in[2])
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Only n>>6 whole 64-byte blocks
// are processed; the function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, DI = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   SI = out[0] cursor
//   Z0-Z2 = matrix entries, Z3 = output accumulator, Z4 = input block / scratch
TEXT ·mulGFNI_3x1_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 6 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x1_64Xor_end
// Broadcast all 3 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointer of out[0]; SI first holds the header base
MOVQ out_base+48(FP), SI
MOVQ (SI), SI
MOVQ start+72(FP), DI
// Add start offset to output
ADDQ DI, SI
// Add start offset to input
ADDQ DI, DX
ADDQ DI, BX
ADDQ DI, CX
mulGFNI_3x1_64Xor_loop:
// Load 1 outputs
// (existing output bytes seed the accumulator)
VMOVDQU64 (SI), Z3
// Load and process 64 bytes from input 0 to 1 outputs
VMOVDQU64 (DX), Z4
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z4, Z4
VXORPD Z3, Z4, Z3
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z4
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z4, Z4
VXORPD Z3, Z4, Z3
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (CX), Z4
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z2, Z4, Z4
VXORPD Z3, Z4, Z3
// Store 1 outputs
VMOVDQU64 Z3, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x1_64Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_3x1_64Xor_end:
RET
// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 3 input slices into 1 output slice, 32 bytes per iteration,
// accumulating onto the existing output contents:
//   out[0] ^= affine(matrix[0], in[0]) ^ affine(matrix[1], in[1]) ^ affine(matrix[2], in[2])
// where affine is VGF2P8AFFINEQB with a zero additive constant and the 8-byte
// matrix entry broadcast to every 64-bit lane. Only n>>5 whole 32-byte blocks
// are processed; the function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, DI = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   SI = out[0] cursor
//   Y0-Y2 = matrix entries, Y3 = output accumulator, Y4 = input block / scratch
TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 6 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x1Xor_end
// Broadcast all 3 matrix entries; CX is free for reuse afterwards
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointer of out[0]; SI first holds the header base
MOVQ out_base+48(FP), SI
MOVQ (SI), SI
MOVQ start+72(FP), DI
// Add start offset to output
ADDQ DI, SI
// Add start offset to input
ADDQ DI, DX
ADDQ DI, BX
ADDQ DI, CX
mulAvxGFNI_3x1Xor_loop:
// Load 1 outputs
// (existing output bytes seed the accumulator)
VMOVDQU (SI), Y3
// Load and process 32 bytes from input 0 to 1 outputs
VMOVDQU (DX), Y4
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y4, Y4
VXORPD Y3, Y4, Y3
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y4
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y4, Y4
VXORPD Y3, Y4, Y3
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (CX), Y4
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y2, Y4, Y4
VXORPD Y3, Y4, Y3
// Store 1 outputs
VMOVDQU Y3, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x1Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_3x1Xor_end:
RET
// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 3 input slices into 2 output slices, 64 bytes per iteration:
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[2+j], in[1]) ^ affine(matrix[4+j], in[2])
// for j = 0..1, where affine is VGF2P8AFFINEQB with a zero additive constant
// and the 8-byte matrix entry broadcast to every 64-bit lane. Previous output
// contents are overwritten. Only n>>6 whole 64-byte blocks are processed; the
// function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, R8 = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   DI, SI = out[0], out[1] cursors
//   Z0-Z5 = matrix entries, Z6-Z7 = output accumulators
//   Z8 = current input block, Z9 = scratch
TEXT ·mulGFNI_3x2_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x2_64_end
// Broadcast all 6 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointers of out[0..1]; SI holds the header base until it is replaced by out[1]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), SI
MOVQ start+72(FP), R8
// Add start offset to output
ADDQ R8, DI
ADDQ R8, SI
// Add start offset to input
ADDQ R8, DX
ADDQ R8, BX
ADDQ R8, CX
mulGFNI_3x2_64_loop:
// Load and process 64 bytes from input 0 to 2 outputs
// (first input initialises the accumulators, so no XOR is needed here)
VMOVDQU64 (DX), Z8
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z8, Z6
VGF2P8AFFINEQB $0x00, Z1, Z8, Z7
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z8
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
VXORPD Z6, Z9, Z6
VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
VXORPD Z7, Z9, Z7
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (CX), Z8
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
VXORPD Z6, Z9, Z6
VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
VXORPD Z7, Z9, Z7
// Store 2 outputs
VMOVDQU64 Z6, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z7, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x2_64_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_3x2_64_end:
RET
// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 3 input slices into 2 output slices, 32 bytes per iteration:
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[2+j], in[1]) ^ affine(matrix[4+j], in[2])
// for j = 0..1, where affine is VGF2P8AFFINEQB with a zero additive constant
// and the 8-byte matrix entry broadcast to every 64-bit lane. Previous output
// contents are overwritten. Only n>>5 whole 32-byte blocks are processed; the
// function returns immediately when that count is zero.
//
// Register roles:
//   AX = blocks remaining, R8 = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   DI, SI = out[0], out[1] cursors
//   Y0-Y5 = matrix entries, Y6-Y7 = output accumulators
//   Y8 = current input block, Y9 = scratch
TEXT ·mulAvxGFNI_3x2(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x2_end
// Broadcast all 6 matrix entries; CX is free for reuse afterwards
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointers of out[0..1]; SI holds the header base until it is replaced by out[1]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), SI
MOVQ start+72(FP), R8
// Add start offset to output
ADDQ R8, DI
ADDQ R8, SI
// Add start offset to input
ADDQ R8, DX
ADDQ R8, BX
ADDQ R8, CX
mulAvxGFNI_3x2_loop:
// Load and process 32 bytes from input 0 to 2 outputs
// (first input initialises the accumulators, so no XOR is needed here)
VMOVDQU (DX), Y8
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y8, Y6
VGF2P8AFFINEQB $0x00, Y1, Y8, Y7
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (BX), Y8
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
VXORPD Y6, Y9, Y6
VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
VXORPD Y7, Y9, Y7
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (CX), Y8
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
VXORPD Y6, Y9, Y6
VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
VXORPD Y7, Y9, Y7
// Store 2 outputs
VMOVDQU Y6, (DI)
ADDQ $0x20, DI
VMOVDQU Y7, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x2_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_3x2_end:
RET
// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 3 input slices into 2 output slices, 64 bytes per iteration,
// accumulating onto the existing output contents:
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[2+j], in[1]) ^ affine(matrix[4+j], in[2])
// for j = 0..1, where affine is VGF2P8AFFINEQB with a zero additive constant
// and the 8-byte matrix entry broadcast to every 64-bit lane. Only n>>6 whole
// 64-byte blocks are processed; the function returns immediately when that
// count is zero.
//
// Register roles:
//   AX = blocks remaining, R8 = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   DI, SI = out[0], out[1] cursors
//   Z0-Z5 = matrix entries, Z6-Z7 = output accumulators
//   Z8 = current input block, Z9 = scratch
TEXT ·mulGFNI_3x2_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; nothing to do when zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x2_64Xor_end
// Broadcast all 6 matrix entries; CX is free for reuse afterwards
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointers of out[0..1]; SI holds the header base until it is replaced by out[1]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), SI
MOVQ start+72(FP), R8
// Add start offset to output
ADDQ R8, DI
ADDQ R8, SI
// Add start offset to input
ADDQ R8, DX
ADDQ R8, BX
ADDQ R8, CX
mulGFNI_3x2_64Xor_loop:
// Load 2 outputs
// (existing output bytes seed the accumulators)
VMOVDQU64 (DI), Z6
VMOVDQU64 (SI), Z7
// Load and process 64 bytes from input 0 to 2 outputs
VMOVDQU64 (DX), Z8
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
VXORPD Z6, Z9, Z6
VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
VXORPD Z7, Z9, Z7
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z8
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
VXORPD Z6, Z9, Z6
VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
VXORPD Z7, Z9, Z7
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (CX), Z8
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z4, Z8, Z9
VXORPD Z6, Z9, Z6
VGF2P8AFFINEQB $0x00, Z5, Z8, Z9
VXORPD Z7, Z9, Z7
// Store 2 outputs
VMOVDQU64 Z6, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z7, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x2_64Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulGFNI_3x2_64Xor_end:
RET
// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 3 input slices into 2 output slices, 32 bytes per iteration,
// accumulating onto the existing output contents:
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[2+j], in[1]) ^ affine(matrix[4+j], in[2])
// for j = 0..1, where affine is VGF2P8AFFINEQB with a zero additive constant
// and the 8-byte matrix entry broadcast to every 64-bit lane. Only n>>5 whole
// 32-byte blocks are processed; the function returns immediately when that
// count is zero.
//
// Register roles:
//   AX = blocks remaining, R8 = start offset
//   CX = matrix base during setup, then in[2] cursor; DX, BX = in[0], in[1] cursors
//   DI, SI = out[0], out[1] cursors
//   Y0-Y5 = matrix entries, Y6-Y7 = output accumulators
//   Y8 = current input block, Y9 = scratch
TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; nothing to do when zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x2Xor_end
// Broadcast all 6 matrix entries; CX is free for reuse afterwards
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Data pointers of in[0..2] (slice headers are 24 bytes apart)
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Data pointers of out[0..1]; SI holds the header base until it is replaced by out[1]
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), SI
MOVQ start+72(FP), R8
// Add start offset to output
ADDQ R8, DI
ADDQ R8, SI
// Add start offset to input
ADDQ R8, DX
ADDQ R8, BX
ADDQ R8, CX
mulAvxGFNI_3x2Xor_loop:
// Load 2 outputs
// (existing output bytes seed the accumulators)
VMOVDQU (DI), Y6
VMOVDQU (SI), Y7
// Load and process 32 bytes from input 0 to 2 outputs
VMOVDQU (DX), Y8
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
VXORPD Y6, Y9, Y6
VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
VXORPD Y7, Y9, Y7
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (BX), Y8
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
VXORPD Y6, Y9, Y6
VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
VXORPD Y7, Y9, Y7
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (CX), Y8
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y4, Y8, Y9
VXORPD Y6, Y9, Y6
VGF2P8AFFINEQB $0x00, Y5, Y8, Y9
VXORPD Y7, Y9, Y7
// Store 2 outputs
VMOVDQU Y6, (DI)
ADDQ $0x20, DI
VMOVDQU Y7, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x2Xor_loop
// Clear the upper vector state before returning
VZEROUPPER
mulAvxGFNI_3x2Xor_end:
RET
// mulGFNI_3x3_64 produces 3 output slices from 3 input slices, 64 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 64-byte block and each output j (0..2):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[3+j], in[1]) ^ affine(matrix[6+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2] | DI, R8, SI out[0..2]
//            Z0-Z8 matrix | Z9-Z11 accumulators | Z12 input | Z13 product
//
// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x3_64_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..2]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulGFNI_3x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z12
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z11

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z12
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (CX), Z12
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Store 3 outputs
	VMOVDQU64 Z9, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z11, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x3_64_loop
	VZEROUPPER

mulGFNI_3x3_64_end:
	RET
// mulAvxGFNI_3x3 produces 3 output slices from 3 input slices, 32 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 32-byte block and each output j (0..2):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[3+j], in[1]) ^ affine(matrix[6+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2] | DI, R8, SI out[0..2]
//            Y0-Y8 matrix | Y9-Y11 accumulators | Y12 input | Y13 product
//
// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x3(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x3_end

	// Broadcast each 64-bit matrix entry across a full YMM register.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..2]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulAvxGFNI_3x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y11

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (CX), Y12
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Store 3 outputs
	VMOVDQU Y9, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x3_loop
	VZEROUPPER

mulAvxGFNI_3x3_end:
	RET
// mulGFNI_3x3_64Xor folds 3 input slices into 3 output slices, 64 bytes per
// loop iteration, XORing the result into the bytes already held by each output.
//
// For each 64-byte block and each output j (0..2):
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[3+j], in[1]) ^ affine(matrix[6+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2] | DI, R8, SI out[0..2]
//            Z0-Z8 matrix | Z9-Z11 accumulators | Z12 input | Z13 product
//
// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x3_64Xor_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..2]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulGFNI_3x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (DI), Z9
	VMOVDQU64 (R8), Z10
	VMOVDQU64 (SI), Z11

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z12
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
	VXORPD Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z12
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (CX), Z12
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD Z9, Z13, Z9
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Store 3 outputs
	VMOVDQU64 Z9, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z10, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z11, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x3_64Xor_loop
	VZEROUPPER

mulGFNI_3x3_64Xor_end:
	RET
// mulAvxGFNI_3x3Xor folds 3 input slices into 3 output slices, 32 bytes per
// loop iteration, XORing the result into the bytes already held by each output.
//
// For each 32-byte block and each output j (0..2):
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[3+j], in[1]) ^ affine(matrix[6+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2] | DI, R8, SI out[0..2]
//            Y0-Y8 matrix | Y9-Y11 accumulators | Y12 input | Y13 product
//
// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x3Xor_end

	// Broadcast each 64-bit matrix entry across a full YMM register.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..2]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), SI
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, DI
	ADDQ R9, R8
	ADDQ R9, SI

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, CX

mulAvxGFNI_3x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (DI), Y9
	VMOVDQU (R8), Y10
	VMOVDQU (SI), Y11

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
	VXORPD Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (CX), Y12
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD Y9, Y13, Y9
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Store 3 outputs
	VMOVDQU Y9, (DI)
	ADDQ $0x20, DI
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (SI)
	ADDQ $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x3Xor_loop
	VZEROUPPER

mulAvxGFNI_3x3Xor_end:
	RET
// mulGFNI_3x4_64 produces 4 output slices from 3 input slices, 64 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 64-byte block and each output j (0..3):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[4+j], in[1]) ^ affine(matrix[8+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2] | DI, R8, R9, SI out[0..3]
//            Z0-Z11 matrix | Z12-Z15 accumulators | Z16 input | Z17 product
//
// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x4_64_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..3]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), SI
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, SI

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, CX

mulGFNI_3x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z16
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z13
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z15

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z16
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (CX), Z16
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Store 4 outputs
	VMOVDQU64 Z12, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z15, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x4_64_loop
	VZEROUPPER

mulGFNI_3x4_64_end:
	RET
// mulAvxGFNI_3x4 produces 4 output slices from 3 input slices, 32 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 32-byte block and each output j (0..3):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[4+j], in[1]) ^ affine(matrix[8+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// Only matrix[0..9] stay resident in Y0-Y9; matrix[10] and matrix[11] are
// re-broadcast from memory on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | CX matrix | BX, SI, DX in[0..2]
//            R8, R9, R10, DI out[0..3] | Y0-Y9 matrix[0..9]
//            Y10-Y13 accumulators | Y14 input | Y15 matrix scratch / product
//
// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x4(SB), $0-88
	// Loading 10 of 12 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x4_end

	// Broadcast the first 10 matrix entries across full YMM registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..3]; the header pointer is loaded once.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), DI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DI

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DX

mulAvxGFNI_3x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11

	// matrix[10] and matrix[11] are not resident: reload them into Y15.
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x4_loop
	VZEROUPPER

mulAvxGFNI_3x4_end:
	RET
// mulGFNI_3x4_64Xor folds 3 input slices into 4 output slices, 64 bytes per
// loop iteration, XORing the result into the bytes already held by each output.
//
// For each 64-byte block and each output j (0..3):
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[4+j], in[1]) ^ affine(matrix[8+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2] | DI, R8, R9, SI out[0..3]
//            Z0-Z11 matrix | Z12-Z15 accumulators | Z16 input | Z17 product
//
// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x4_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x4_64Xor_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..3]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), SI
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, R9
	ADDQ R10, SI

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, CX

mulGFNI_3x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (DI), Z12
	VMOVDQU64 (R8), Z13
	VMOVDQU64 (R9), Z14
	VMOVDQU64 (SI), Z15

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (DX), Z16
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
	VXORPD Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
	VXORPD Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z16
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (CX), Z16
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD Z12, Z17, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD Z13, Z17, Z13
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Store 4 outputs
	VMOVDQU64 Z12, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z13, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z14, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z15, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x4_64Xor_loop
	VZEROUPPER

mulGFNI_3x4_64Xor_end:
	RET
// mulAvxGFNI_3x4Xor folds 3 input slices into 4 output slices, 32 bytes per
// loop iteration, XORing the result into the bytes already held by each output.
//
// For each 32-byte block and each output j (0..3):
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[4+j], in[1]) ^ affine(matrix[8+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// Only matrix[0..9] stay resident in Y0-Y9; matrix[10] and matrix[11] are
// re-broadcast from memory on every iteration, so CX must keep pointing at the
// matrix for the whole loop.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | CX matrix | BX, SI, DX in[0..2]
//            R8, R9, R10, DI out[0..3] | Y0-Y9 matrix[0..9]
//            Y10-Y13 accumulators | Y14 input | Y15 matrix scratch / product
//
// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88
	// Loading 10 of 12 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x4Xor_end

	// Broadcast the first 10 matrix entries across full YMM registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..3]; the header pointer is loaded once.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), DI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, DI

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DX

mulAvxGFNI_3x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R8), Y10
	VMOVDQU (R9), Y11
	VMOVDQU (R10), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11

	// matrix[10] and matrix[11] are not resident: reload them into Y15.
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y11, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y12, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x4Xor_loop
	VZEROUPPER

mulAvxGFNI_3x4Xor_end:
	RET
// mulGFNI_3x5_64 produces 5 output slices from 3 input slices, 64 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 64-byte block and each output j (0..4):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[5+j], in[1]) ^ affine(matrix[10+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2]
//            DI, R8, R9, R10, SI out[0..4] | Z0-Z14 matrix
//            Z15-Z19 accumulators | Z20 input | Z21 product
//
// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x5_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x5_64_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..4]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), SI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, SI

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, CX

mulGFNI_3x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z20
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z17
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z19

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z20
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (CX), Z20
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Store 5 outputs
	VMOVDQU64 Z15, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z16, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z19, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x5_64_loop
	VZEROUPPER

mulGFNI_3x5_64_end:
	RET
// mulAvxGFNI_3x5 produces 5 output slices from 3 input slices, 32 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 32-byte block and each output j (0..4):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[5+j], in[1]) ^ affine(matrix[10+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// Only matrix[0..8] stay resident in Y0-Y8; matrix[9..14] are re-broadcast
// from memory on every iteration, so CX must keep pointing at the matrix for
// the whole loop.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | CX matrix | BX, SI, DX in[0..2]
//            R8, R9, R10, R11, DI out[0..4] | Y0-Y8 matrix[0..8]
//            Y9-Y13 accumulators | Y14 input | Y15 matrix scratch / product
//
// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x5(SB), $0-88
	// Loading 9 of 15 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x5_end

	// Broadcast the first 9 matrix entries across full YMM registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..4]; the header pointer is loaded once.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), DI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DI

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DX

mulAvxGFNI_3x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12

	// From matrix[9] on, entries are reloaded into Y15 before each use.
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y11, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x5_loop
	VZEROUPPER

mulAvxGFNI_3x5_end:
	RET
// mulGFNI_3x5_64Xor folds 3 input slices into 5 output slices, 64 bytes per
// loop iteration, XORing the result into the bytes already held by each output.
//
// For each 64-byte block and each output j (0..4):
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[5+j], in[1]) ^ affine(matrix[10+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2]
//            DI, R8, R9, R10, SI out[0..4] | Z0-Z14 matrix
//            Z15-Z19 accumulators | Z20 input | Z21 product
//
// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x5_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x5_64Xor_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..4]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), SI
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, SI

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, CX

mulGFNI_3x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (DI), Z15
	VMOVDQU64 (R8), Z16
	VMOVDQU64 (R9), Z17
	VMOVDQU64 (R10), Z18
	VMOVDQU64 (SI), Z19

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (DX), Z20
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z20
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (CX), Z20
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD Z15, Z21, Z15
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD Z16, Z21, Z16
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD Z17, Z21, Z17
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Store 5 outputs
	VMOVDQU64 Z15, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z16, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z17, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z18, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z19, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x5_64Xor_loop
	VZEROUPPER

mulGFNI_3x5_64Xor_end:
	RET
// mulAvxGFNI_3x5Xor folds 3 input slices into 5 output slices, 32 bytes per
// loop iteration, XORing the result into the bytes already held by each output.
//
// For each 32-byte block and each output j (0..4):
//   out[j] ^= affine(matrix[j], in[0]) ^ affine(matrix[5+j], in[1]) ^ affine(matrix[10+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// Only matrix[0..8] stay resident in Y0-Y8; matrix[9..14] are re-broadcast
// from memory on every iteration, so CX must keep pointing at the matrix for
// the whole loop.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | CX matrix | BX, SI, DX in[0..2]
//            R8, R9, R10, R11, DI out[0..4] | Y0-Y8 matrix[0..8]
//            Y9-Y13 accumulators | Y14 input | Y15 matrix scratch / product
//
// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88
	// Loading 9 of 15 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x5Xor_end

	// Broadcast the first 9 matrix entries across full YMM registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..4]; the header pointer is loaded once.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), DI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, DI

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DX

mulAvxGFNI_3x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (R8), Y9
	VMOVDQU (R9), Y10
	VMOVDQU (R10), Y11
	VMOVDQU (R11), Y12
	VMOVDQU (DI), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12

	// From matrix[9] on, entries are reloaded into Y15 before each use.
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y10, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y11, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x5Xor_loop
	VZEROUPPER

mulAvxGFNI_3x5Xor_end:
	RET
// mulGFNI_3x6_64 produces 6 output slices from 3 input slices, 64 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 64-byte block and each output j (0..5):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[6+j], in[1]) ^ affine(matrix[12+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// n>>6 blocks are processed (none if that is zero); the n&63 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | DX, BX, CX in[0..2]
//            DI, R8, R9, R10, R11, SI out[0..5] | Z0-Z17 matrix
//            Z18-Z23 accumulators | Z24 input | Z25 product
//
// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_3x6_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_3x6_64_end

	// Broadcast each 64-bit matrix entry across a full ZMM register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), CX

	// Data pointers of out[0..5]; the header pointer is loaded once.
	MOVQ out_base+48(FP), SI
	MOVQ (SI), DI
	MOVQ 24(SI), R8
	MOVQ 48(SI), R9
	MOVQ 72(SI), R10
	MOVQ 96(SI), R11
	MOVQ 120(SI), SI
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, SI

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, CX

mulGFNI_3x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z23

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z18, Z25, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z19, Z25, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 6 outputs
	VMOVDQU64 Z18, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z19, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (SI)
	ADDQ $0x40, SI

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_3x6_64_loop
	VZEROUPPER

mulGFNI_3x6_64_end:
	RET
// mulAvxGFNI_3x6 produces 6 output slices from 3 input slices, 32 bytes per
// loop iteration, overwriting the previous output contents.
//
// For each 32-byte block and each output j (0..5):
//   out[j] = affine(matrix[j], in[0]) ^ affine(matrix[6+j], in[1]) ^ affine(matrix[12+j], in[2])
// where affine(m, x) is VGF2P8AFFINEQB $0x00, m, x.
// NOTE(review): each matrix entry is presumably an 8x8 bit matrix encoding a
// GF(2^8) multiply; confirm against the Go caller.
//
// Only matrix[0..7] stay resident in Y0-Y7; matrix[8..17] are re-broadcast
// from memory on every iteration, so CX must keep pointing at the matrix for
// the whole loop.
//
// n>>5 blocks are processed (none if that is zero); the n&31 remainder is not
// handled here. Every slice is accessed from byte offset start onward.
//
// Registers: AX blocks left | CX matrix | BX, SI, DX in[0..2]
//            R8, R9, R10, R11, R12, DI out[0..5] | Y0-Y7 matrix[0..7]
//            Y8-Y13 accumulators | Y14 input | Y15 matrix scratch / product
//
// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_3x6(SB), $0-88
	// Loading 8 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks; SHRQ sets ZF on a zero result,
	// so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_3x6_end

	// Broadcast the first 8 matrix entries across full YMM registers.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of in[0..2]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DX

	// Data pointers of out[0..5]; the header pointer is loaded once.
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), DI
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, DI

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DX

mulAvxGFNI_3x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	// (the first input initialises the accumulators directly)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9

	// From matrix[8] on, entries are reloaded into Y15 before each use.
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y9, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_3x6_loop
	VZEROUPPER

mulAvxGFNI_3x6_end:
	RET
// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs into 6 existing outputs the products of 3 inputs, 64 bytes per loop
// iteration. For each block,
// out[j] ^= XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*6+j]) with immediate 0.
// Runs n>>6 iterations, beginning at byte offset start in every input and
// output, and returns without touching the slices when n>>6 is zero.
TEXT ·mulGFNI_3x6_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x06, AX
JZ mulGFNI_3x6_64Xor_end
// Z0-Z17 = matrix[0..17], each qword broadcast to every lane.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
// The whole matrix is now in registers, so CX is reused for the inputs.
// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Output data pointers: DI, R8-R11 and SI (SI is replaced last).
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), SI
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, SI
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, CX
mulGFNI_3x6_64Xor_loop:
// Load 6 outputs
// Z18-Z23 start from the current output contents and accumulate every product.
VMOVDQU64 (DI), Z18
VMOVDQU64 (R8), Z19
VMOVDQU64 (R9), Z20
VMOVDQU64 (R10), Z21
VMOVDQU64 (R11), Z22
VMOVDQU64 (SI), Z23
// Load and process 64 bytes from input 0 to 6 outputs
// Z24 holds the input block, Z25 is the scratch product.
VMOVDQU64 (DX), Z24
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
VXORPD Z18, Z25, Z18
VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
VXORPD Z19, Z25, Z19
VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
VXORPD Z20, Z25, Z20
VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (BX), Z24
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
VXORPD Z18, Z25, Z18
VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
VXORPD Z19, Z25, Z19
VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
VXORPD Z20, Z25, Z20
VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (CX), Z24
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
VXORPD Z18, Z25, Z18
VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
VXORPD Z19, Z25, Z19
VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
VXORPD Z20, Z25, Z20
VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
VXORPD Z23, Z25, Z23
// Store 6 outputs
VMOVDQU64 Z18, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z19, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z20, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z21, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z22, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z23, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x6_64Xor_loop
VZEROUPPER
mulGFNI_3x6_64Xor_end:
RET
// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs into 6 existing outputs the products of 3 inputs, 32 bytes per loop
// iteration. For each block,
// out[j] ^= XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*6+j]) with immediate 0.
// Runs n>>5 iterations, beginning at byte offset start in every input and
// output, and returns without touching the slices when n>>5 is zero.
TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88
// Loading 8 of 18 tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_3x6Xor_end
// Y0-Y7 = matrix[0..7], each qword broadcast to every lane. CX keeps pointing
// at the matrix because entries 8..17 are re-read inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// Output data pointers: R8-R12 and DI (DI is replaced last).
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), DI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, DI
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DX
mulAvxGFNI_3x6Xor_loop:
// Load 6 outputs
// Y8-Y13 start from the current output contents and accumulate every product.
VMOVDQU (R8), Y8
VMOVDQU (R9), Y9
VMOVDQU (R10), Y10
VMOVDQU (R11), Y11
VMOVDQU (R12), Y12
VMOVDQU (DI), Y13
// Load and process 32 bytes from input 0 to 6 outputs
// Y14 holds the input block, Y15 is scratch (table entry, then product).
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (R8)
ADDQ $0x20, R8
VMOVDQU Y9, (R9)
ADDQ $0x20, R9
VMOVDQU Y10, (R10)
ADDQ $0x20, R10
VMOVDQU Y11, (R11)
ADDQ $0x20, R11
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x6Xor_loop
VZEROUPPER
mulAvxGFNI_3x6Xor_end:
RET
// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites 7 outputs computed from 3 inputs, 64 bytes per loop iteration.
// For each block, out[j] = XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*7+j])
// with immediate 0. Runs n>>6 iterations, beginning at byte offset start in
// every input and output, and returns without touching the slices when n>>6
// is zero.
TEXT ·mulGFNI_3x7_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 30 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x06, AX
JZ mulGFNI_3x7_64_end
// Z0-Z20 = matrix[0..20], each qword broadcast to every lane.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// The whole matrix is now in registers, so CX is reused for the inputs.
// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Output data pointers: DI, R8-R12 and SI (SI is replaced last).
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), SI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, SI
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, CX
mulGFNI_3x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// The first input initialises the accumulators Z21-Z27, so no XOR is needed.
VMOVDQU64 (DX), Z28
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z28, Z21
VGF2P8AFFINEQB $0x00, Z1, Z28, Z22
VGF2P8AFFINEQB $0x00, Z2, Z28, Z23
VGF2P8AFFINEQB $0x00, Z3, Z28, Z24
VGF2P8AFFINEQB $0x00, Z4, Z28, Z25
VGF2P8AFFINEQB $0x00, Z5, Z28, Z26
VGF2P8AFFINEQB $0x00, Z6, Z28, Z27
// Load and process 64 bytes from input 1 to 7 outputs
// Z28 holds the input block, Z29 is the scratch product.
VMOVDQU64 (BX), Z28
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
VXORPD Z21, Z29, Z21
VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
VXORPD Z22, Z29, Z22
VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
VXORPD Z23, Z29, Z23
VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (CX), Z28
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
VXORPD Z21, Z29, Z21
VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
VXORPD Z22, Z29, Z22
VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
VXORPD Z23, Z29, Z23
VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
VXORPD Z27, Z29, Z27
// Store 7 outputs
VMOVDQU64 Z21, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z22, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x7_64_loop
VZEROUPPER
mulGFNI_3x7_64_end:
RET
// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites 7 outputs computed from 3 inputs, 32 bytes per loop iteration.
// For each block, out[j] = XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*7+j])
// with immediate 0. Runs n>>5 iterations, beginning at byte offset start in
// every input and output, and returns without touching the slices when n>>5
// is zero.
TEXT ·mulAvxGFNI_3x7(SB), $0-88
// Loading 7 of 21 tables to registers
// Destination kept in GP registers
// Full registers estimated 30 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_3x7_end
// Y0-Y6 = matrix[0..6], each qword broadcast to every lane. CX keeps pointing
// at the matrix because entries 7..20 are re-read inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// Output data pointers: R8-R13 and DI (DI is replaced last).
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), DI
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, DI
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DX
mulAvxGFNI_3x7_loop:
// Load and process 32 bytes from input 0 to 7 outputs
// The first input initialises the accumulators Y7-Y13, so no XOR is needed.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// Y15 is scratch: it holds a broadcast table entry, then the product.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
VMOVDQU Y7, (R8)
ADDQ $0x20, R8
VMOVDQU Y8, (R9)
ADDQ $0x20, R9
VMOVDQU Y9, (R10)
ADDQ $0x20, R10
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x7_loop
VZEROUPPER
mulAvxGFNI_3x7_end:
RET
// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs into 7 existing outputs the products of 3 inputs, 64 bytes per loop
// iteration. For each block,
// out[j] ^= XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*7+j]) with immediate 0.
// Runs n>>6 iterations, beginning at byte offset start in every input and
// output, and returns without touching the slices when n>>6 is zero.
TEXT ·mulGFNI_3x7_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 30 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x06, AX
JZ mulGFNI_3x7_64Xor_end
// Z0-Z20 = matrix[0..20], each qword broadcast to every lane.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// The whole matrix is now in registers, so CX is reused for the inputs.
// Input data pointers (slice headers are 24 bytes apart): DX, BX, CX.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), CX
// Output data pointers: DI, R8-R12 and SI (SI is replaced last).
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), SI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, SI
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, CX
mulGFNI_3x7_64Xor_loop:
// Load 7 outputs
// Z21-Z27 start from the current output contents and accumulate every product.
VMOVDQU64 (DI), Z21
VMOVDQU64 (R8), Z22
VMOVDQU64 (R9), Z23
VMOVDQU64 (R10), Z24
VMOVDQU64 (R11), Z25
VMOVDQU64 (R12), Z26
VMOVDQU64 (SI), Z27
// Load and process 64 bytes from input 0 to 7 outputs
// Z28 holds the input block, Z29 is the scratch product.
VMOVDQU64 (DX), Z28
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
VXORPD Z21, Z29, Z21
VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
VXORPD Z22, Z29, Z22
VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
VXORPD Z23, Z29, Z23
VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (BX), Z28
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
VXORPD Z21, Z29, Z21
VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
VXORPD Z22, Z29, Z22
VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
VXORPD Z23, Z29, Z23
VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (CX), Z28
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
VXORPD Z21, Z29, Z21
VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
VXORPD Z22, Z29, Z22
VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
VXORPD Z23, Z29, Z23
VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
VXORPD Z27, Z29, Z27
// Store 7 outputs
VMOVDQU64 Z21, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z22, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x7_64Xor_loop
VZEROUPPER
mulGFNI_3x7_64Xor_end:
RET
// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XORs into 7 existing outputs the products of 3 inputs, 32 bytes per loop
// iteration. For each block,
// out[j] ^= XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*7+j]) with immediate 0.
// Runs n>>5 iterations, beginning at byte offset start in every input and
// output, and returns without touching the slices when n>>5 is zero.
TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88
// Loading 7 of 21 tables to registers
// Destination kept in GP registers
// Full registers estimated 30 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_3x7Xor_end
// Y0-Y6 = matrix[0..6], each qword broadcast to every lane. CX keeps pointing
// at the matrix because entries 7..20 are re-read inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// Output data pointers: R8-R13 and DI (DI is replaced last).
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), DI
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, DI
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DX
mulAvxGFNI_3x7Xor_loop:
// Load 7 outputs
// Y7-Y13 start from the current output contents and accumulate every product.
VMOVDQU (R8), Y7
VMOVDQU (R9), Y8
VMOVDQU (R10), Y9
VMOVDQU (R11), Y10
VMOVDQU (R12), Y11
VMOVDQU (R13), Y12
VMOVDQU (DI), Y13
// Load and process 32 bytes from input 0 to 7 outputs
// Y14 holds the input block, Y15 is scratch (table entry, then product).
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 7 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
VMOVDQU Y7, (R8)
ADDQ $0x20, R8
VMOVDQU Y8, (R9)
ADDQ $0x20, R9
VMOVDQU Y9, (R10)
ADDQ $0x20, R10
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x7Xor_loop
VZEROUPPER
mulAvxGFNI_3x7Xor_end:
RET
// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwrites 8 outputs computed from 3 inputs, 64 bytes per loop iteration.
// For each block, out[j] = XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*8+j])
// with immediate 0. Runs n>>6 iterations, beginning at byte offset start in
// every input and output, and returns without touching the slices when n>>6
// is zero.
TEXT ·mulGFNI_3x8_64(SB), $0-88
// Loading 22 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 34 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x06, AX
JZ mulGFNI_3x8_64_end
// Z0-Z21 = matrix[0..21], each qword broadcast to every lane. CX keeps
// pointing at the matrix: entries 22 and 23 are read in the loop with an
// embedded broadcast (.BCST) memory operand.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// Output data pointers: R8-R14 and DI (DI is replaced last).
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), DI
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, DI
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DX
mulGFNI_3x8_64_loop:
// Load and process 64 bytes from input 0 to 8 outputs
// The first input initialises the accumulators Z22-Z29, so no XOR is needed.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
// Load and process 64 bytes from input 1 to 8 outputs
// Z30 holds the input block, Z31 is the scratch product.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
// matrix[22] and matrix[23] did not get a register; broadcast from memory.
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
VMOVDQU64 Z22, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x8_64_loop
VZEROUPPER
mulGFNI_3x8_64_end:
RET
// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwrites 8 outputs computed from 3 inputs, 32 bytes per loop iteration.
// For each block, out[j] = XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*8+j])
// with immediate 0. Runs n>>5 iterations, beginning at byte offset start in
// every input and output, and returns without touching the slices when n>>5
// is zero.
TEXT ·mulAvxGFNI_3x8(SB), $0-88
// Loading 6 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 34 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_3x8_end
// Y0-Y5 = matrix[0..5], each qword broadcast to every lane. CX keeps pointing
// at the matrix because entries 6..23 are re-read inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// Output data pointers: R8-R14 and DI (DI is replaced last).
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), DI
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, DI
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DX
mulAvxGFNI_3x8_loop:
// Load and process 32 bytes from input 0 to 8 outputs
// The first input initialises the accumulators Y6-Y13, so no XOR is needed.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
// matrix[6] and matrix[7] are broadcast straight into the accumulator
// register, which is then replaced by the product.
VBROADCASTSD 48(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 56(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 8 outputs
// Y15 is scratch: it holds a broadcast table entry, then the product.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
VMOVDQU Y6, (R8)
ADDQ $0x20, R8
VMOVDQU Y7, (R9)
ADDQ $0x20, R9
VMOVDQU Y8, (R10)
ADDQ $0x20, R10
VMOVDQU Y9, (R11)
ADDQ $0x20, R11
VMOVDQU Y10, (R12)
ADDQ $0x20, R12
VMOVDQU Y11, (R13)
ADDQ $0x20, R13
VMOVDQU Y12, (R14)
ADDQ $0x20, R14
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x8_loop
VZEROUPPER
mulAvxGFNI_3x8_end:
RET
// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XORs into 8 existing outputs the products of 3 inputs, 64 bytes per loop
// iteration. For each block,
// out[j] ^= XOR over i of VGF2P8AFFINEQB(in[i], matrix[i*8+j]) with immediate 0.
// Runs n>>6 iterations, beginning at byte offset start in every input and
// output, and returns without touching the slices when n>>6 is zero.
TEXT ·mulGFNI_3x8_64Xor(SB), $0-88
// Loading 22 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 34 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so JZ can consume it directly.
SHRQ $0x06, AX
JZ mulGFNI_3x8_64Xor_end
// Z0-Z21 = matrix[0..21], each qword broadcast to every lane. CX keeps
// pointing at the matrix: entries 22 and 23 are read in the loop with an
// embedded broadcast (.BCST) memory operand.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Input data pointers (slice headers are 24 bytes apart): BX, SI, DX.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// Output data pointers: R8-R14 and DI (DI is replaced last).
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), DI
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, DI
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DX
mulGFNI_3x8_64Xor_loop:
// Load 8 outputs
// Z22-Z29 start from the current output contents and accumulate every product.
VMOVDQU64 (R8), Z22
VMOVDQU64 (R9), Z23
VMOVDQU64 (R10), Z24
VMOVDQU64 (R11), Z25
VMOVDQU64 (R12), Z26
VMOVDQU64 (R13), Z27
VMOVDQU64 (R14), Z28
VMOVDQU64 (DI), Z29
// Load and process 64 bytes from input 0 to 8 outputs
// Z30 holds the input block, Z31 is the scratch product.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 8 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
// matrix[22] and matrix[23] did not get a register; broadcast from memory.
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
VMOVDQU64 Z22, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x8_64Xor_loop
VZEROUPPER
mulGFNI_3x8_64Xor_end:
RET
// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x8Xor updates 8 output slices in place from 3 input slices.
// For every 32-byte block it loads the current contents of each output,
// XORs in VGF2P8AFFINEQB(input[i], matrix[i*8+o], imm 0) for i = 0..2, and
// stores the result back. n>>5 blocks are processed, beginning at byte
// offset start in every slice; the n%32 tail bytes are not touched and the
// routine returns without any memory access to in/out when n < 32.
//
// Register use:
//   AX          remaining 32-byte blocks
//   CX          matrix base pointer
//   BX, SI, DX  read cursors for inputs 0, 1, 2
//   R8-R14, DI  write cursors for outputs 0..7
//   R15         start offset (setup only)
//   Y0-Y5       matrix[0..5] broadcast once; the rest are broadcast on use
//   Y6-Y13      output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88
// Loading 6 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 34 YMM used
// AX = number of whole 32-byte blocks; nothing to do when it is zero
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x8Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), DI
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, DI
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DX
mulAvxGFNI_3x8Xor_loop:
// Load 8 outputs
VMOVDQU (R8), Y6
VMOVDQU (R9), Y7
VMOVDQU (R10), Y8
VMOVDQU (R11), Y9
VMOVDQU (R12), Y10
VMOVDQU (R13), Y11
VMOVDQU (R14), Y12
VMOVDQU (DI), Y13
// Load and process 32 bytes from input 0 to 8 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
// Matrix entries 6 and up are not resident; broadcast each into Y15 first
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 8 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
VMOVDQU Y6, (R8)
ADDQ $0x20, R8
VMOVDQU Y7, (R9)
ADDQ $0x20, R9
VMOVDQU Y8, (R10)
ADDQ $0x20, R10
VMOVDQU Y9, (R11)
ADDQ $0x20, R11
VMOVDQU Y10, (R12)
ADDQ $0x20, R12
VMOVDQU Y11, (R13)
ADDQ $0x20, R13
VMOVDQU Y12, (R14)
ADDQ $0x20, R14
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x8Xor_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulAvxGFNI_3x8Xor_end:
RET
// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x9_64 computes 9 output slices from 3 input slices, overwriting
// the outputs. For every 64-byte block, output o receives the XOR over
// i = 0..2 of VGF2P8AFFINEQB(input[i], matrix[i*9+o], imm 0). n>>6 blocks
// are processed, beginning at byte offset start in every slice; the n%64
// tail bytes are not touched and the routine returns without accessing
// in/out when n < 64.
//
// Register use:
//   AX              remaining 64-byte blocks
//   CX              matrix base pointer
//   BX, SI, DX      read cursors for inputs 0, 1, 2
//   R8-R15, DI      write cursors for outputs 0..8
//   BP              start offset (setup only)
//   Z0-Z20          matrix[0..20] broadcast once; matrix[21..26] are used
//                   as broadcast memory operands
//   Z21-Z29         output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_3x9_64(SB), $8-88
// Loading 21 of 27 tables to registers
// Destination kept in GP registers
// Full registers estimated 38 YMM used
// AX = number of whole 64-byte blocks; nothing to do when it is zero
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x9_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DX
mulGFNI_3x9_64_loop:
// Load and process 64 bytes from input 0 to 9 outputs
// Input 0 initialises the accumulators directly, so no XOR is needed here
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
// Load and process 64 bytes from input 1 to 9 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
// Matrix entries 21..26 are not resident; use them as broadcast operands
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
VMOVDQU64 Z21, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x9_64_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulGFNI_3x9_64_end:
RET
// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x9 computes 9 output slices from 3 input slices, overwriting
// the outputs. For every 32-byte block, output o receives the XOR over
// i = 0..2 of VGF2P8AFFINEQB(input[i], matrix[i*9+o], imm 0). n>>5 blocks
// are processed, beginning at byte offset start in every slice; the n%32
// tail bytes are not touched and the routine returns without accessing
// in/out when n < 32.
//
// Register use:
//   AX              remaining 32-byte blocks
//   CX              matrix base pointer
//   BX, SI, DX      read cursors for inputs 0, 1, 2
//   R8-R15, DI      write cursors for outputs 0..8
//   BP              start offset (setup only)
//   Y0-Y4           matrix[0..4] broadcast once; the rest are broadcast on use
//   Y5-Y13          output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_3x9(SB), $8-88
// Loading 5 of 27 tables to registers
// Destination kept in GP registers
// Full registers estimated 38 YMM used
// AX = number of whole 32-byte blocks; nothing to do when it is zero
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x9_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DX
mulAvxGFNI_3x9_loop:
// Load and process 32 bytes from input 0 to 9 outputs
// Input 0 initialises the accumulators directly, so no XOR is needed here
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
// Non-resident entries are broadcast into the accumulator itself, which
// the transform then overwrites with the product
VBROADCASTSD 40(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 48(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 56(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 64(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x9_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulAvxGFNI_3x9_end:
RET
// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x9_64Xor updates 9 output slices in place from 3 input slices.
// For every 64-byte block it loads the current contents of each output,
// XORs in VGF2P8AFFINEQB(input[i], matrix[i*9+o], imm 0) for i = 0..2, and
// stores the result back. n>>6 blocks are processed, beginning at byte
// offset start in every slice; the n%64 tail bytes are not touched and the
// routine returns without accessing in/out when n < 64.
//
// Register use:
//   AX              remaining 64-byte blocks
//   CX              matrix base pointer
//   BX, SI, DX      read cursors for inputs 0, 1, 2
//   R8-R15, DI      read/write cursors for outputs 0..8
//   BP              start offset (setup only)
//   Z0-Z20          matrix[0..20] broadcast once; matrix[21..26] are used
//                   as broadcast memory operands
//   Z21-Z29         output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_3x9_64Xor(SB), $8-88
// Loading 21 of 27 tables to registers
// Destination kept in GP registers
// Full registers estimated 38 YMM used
// AX = number of whole 64-byte blocks; nothing to do when it is zero
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x9_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DX
mulGFNI_3x9_64Xor_loop:
// Load 9 outputs
VMOVDQU64 (R8), Z21
VMOVDQU64 (R9), Z22
VMOVDQU64 (R10), Z23
VMOVDQU64 (R11), Z24
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (DI), Z29
// Load and process 64 bytes from input 0 to 9 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 9 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
// Matrix entries 21..26 are not resident; use them as broadcast operands
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
VMOVDQU64 Z21, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_3x9_64Xor_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulGFNI_3x9_64Xor_end:
RET
// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x9Xor updates 9 output slices in place from 3 input slices.
// For every 32-byte block it loads the current contents of each output,
// XORs in VGF2P8AFFINEQB(input[i], matrix[i*9+o], imm 0) for i = 0..2, and
// stores the result back. n>>5 blocks are processed, beginning at byte
// offset start in every slice; the n%32 tail bytes are not touched and the
// routine returns without accessing in/out when n < 32.
//
// Register use:
//   AX              remaining 32-byte blocks
//   CX              matrix base pointer
//   BX, SI, DX      read cursors for inputs 0, 1, 2
//   R8-R15, DI      read/write cursors for outputs 0..8
//   BP              start offset (setup only)
//   Y0-Y4           matrix[0..4] broadcast once; the rest are broadcast on use
//   Y5-Y13          output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88
// Loading 5 of 27 tables to registers
// Destination kept in GP registers
// Full registers estimated 38 YMM used
// AX = number of whole 32-byte blocks; nothing to do when it is zero
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x9Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DX
mulAvxGFNI_3x9Xor_loop:
// Load 9 outputs
VMOVDQU (R8), Y5
VMOVDQU (R9), Y6
VMOVDQU (R10), Y7
VMOVDQU (R11), Y8
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (DI), Y13
// Load and process 32 bytes from input 0 to 9 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
// Matrix entries 5 and up are not resident; broadcast each into Y15 first
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_3x9Xor_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulAvxGFNI_3x9Xor_end:
RET
// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x10_64 computes 10 output slices from 3 input slices, overwriting
// the outputs. For every 64-byte block, output o receives the XOR over
// i = 0..2 of VGF2P8AFFINEQB(input[i], matrix[i*10+o], imm 0). n>>6 blocks
// are processed, beginning at byte offset start in every slice; the n%64
// tail bytes are not touched and the routine returns without accessing
// in/out when n < 64.
//
// Register use:
//   AX              n>>6 for the early-exit test, then input 2 read cursor
//   CX              matrix base pointer
//   DX, BX, AX      read cursors for inputs 0, 1, 2
//   DI, R8-R15, SI  write cursors for outputs 0..9
//   BP              start offset during setup, then remaining 64-byte blocks
//   Z0-Z19          matrix[0..19] broadcast once; matrix[20..29] are used
//                   as broadcast memory operands
//   Z20-Z29         output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_3x10_64(SB), $8-88
// Loading 20 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX is only used here to detect n < 64; the block count is rebuilt in BP
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x10_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), AX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), R15
MOVQ 216(SI), SI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, SI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_3x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// Input 0 initialises the accumulators directly, so no XOR is needed here
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// Matrix entries 20..29 are not resident; use them as broadcast operands
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
VMOVDQU64 Z20, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z21, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ BP
JNZ mulGFNI_3x10_64_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulGFNI_3x10_64_end:
RET
// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_3x10 computes 10 output slices from 3 input slices, overwriting
// the outputs. For every 32-byte block, output o receives the XOR over
// i = 0..2 of VGF2P8AFFINEQB(input[i], matrix[i*10+o], imm 0). n>>5 blocks
// are processed, beginning at byte offset start in every slice; the n%32
// tail bytes are not touched and the routine returns without accessing
// in/out when n < 32.
//
// Register use:
//   AX              n>>5 for the early-exit test, then input 2 read cursor
//   CX              matrix base pointer
//   DX, BX, AX      read cursors for inputs 0, 1, 2
//   DI, R8-R15, SI  write cursors for outputs 0..9
//   BP              start offset during setup, then remaining 32-byte blocks
//   Y0-Y3           matrix[0..3] broadcast once; the rest are broadcast on use
//   Y4-Y13          output accumulators, Y14 current input block, Y15 scratch
TEXT ·mulAvxGFNI_3x10(SB), $8-88
// Loading 4 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX is only used here to detect n < 32; the block count is rebuilt in BP
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_3x10_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), AX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), R15
MOVQ 216(SI), SI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, SI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_3x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// Input 0 initialises the accumulators directly, so no XOR is needed here
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
// Non-resident entries are broadcast into the accumulator itself, which
// the transform then overwrites with the product
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
VMOVDQU Y4, (DI)
ADDQ $0x20, DI
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (SI)
ADDQ $0x20, SI
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_3x10_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulAvxGFNI_3x10_end:
RET
// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_3x10_64Xor updates 10 output slices in place from 3 input slices.
// For every 64-byte block it loads the current contents of each output,
// XORs in VGF2P8AFFINEQB(input[i], matrix[i*10+o], imm 0) for i = 0..2, and
// stores the result back. n>>6 blocks are processed, beginning at byte
// offset start in every slice; the n%64 tail bytes are not touched and the
// routine returns without accessing in/out when n < 64.
//
// Register use:
//   AX              n>>6 for the early-exit test, then input 2 read cursor
//   CX              matrix base pointer
//   DX, BX, AX      read cursors for inputs 0, 1, 2
//   DI, R8-R15, SI  read/write cursors for outputs 0..9
//   BP              start offset during setup, then remaining 64-byte blocks
//   Z0-Z19          matrix[0..19] broadcast once; matrix[20..29] are used
//                   as broadcast memory operands
//   Z20-Z29         output accumulators, Z30 current input block, Z31 scratch
TEXT ·mulGFNI_3x10_64Xor(SB), $8-88
// Loading 20 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX is only used here to detect n < 64; the block count is rebuilt in BP
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_3x10_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Slice headers in in/out are 24 bytes apart; the register holding the
// header array is overwritten last with the final data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), AX
// out_base is loaded once (the duplicated load was redundant)
MOVQ out_base+48(FP), SI
MOVQ (SI), DI
MOVQ 24(SI), R8
MOVQ 48(SI), R9
MOVQ 72(SI), R10
MOVQ 96(SI), R11
MOVQ 120(SI), R12
MOVQ 144(SI), R13
MOVQ 168(SI), R14
MOVQ 192(SI), R15
MOVQ 216(SI), SI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, SI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_3x10_64Xor_loop:
// Load 10 outputs
VMOVDQU64 (DI), Z20
VMOVDQU64 (R8), Z21
VMOVDQU64 (R9), Z22
VMOVDQU64 (R10), Z23
VMOVDQU64 (R11), Z24
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (SI), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// Matrix entries 20..29 are not resident; use them as broadcast operands
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
VMOVDQU64 Z20, (DI)
ADDQ $0x40, DI
VMOVDQU64 Z21, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (SI)
ADDQ $0x40, SI
// Prepare for next loop
DECQ BP
JNZ mulGFNI_3x10_64Xor_loop
// Only reached after vector work; the early exit jumps past it
VZEROUPPER
mulGFNI_3x10_64Xor_end:
RET
// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*10+o] (VGF2P8AFFINEQB, constant term 0) and XORs the result into
// the existing contents of out[o], for i = 0..2 and o = 0..9.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   CX              matrix base; tables 4..29 are re-broadcast from memory in the loop
//   Y0-Y3           tables 0..3 (input 0 -> outputs 0..3)
//   DX, BX, AX      read cursors for in[0], in[1], in[2]
//   DI, R8-R15, SI  write cursors for out[0]..out[9]
//   BP              start offset during setup, then remaining block count
//   Y4-Y13          accumulators for out[0]..out[9]; Y14 input block; Y15 scratch
//
// NOTE(review): BP is used as a general-purpose register; the non-zero frame
// size ($8) presumably makes the assembler's prologue preserve the caller's
// BP - confirm against the generator before changing the frame size.
TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_3x10Xor_end

	// Only 4 of the 30 tables fit in registers; broadcast them once.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3

	// Data pointers of in[0..2] (slice headers are 24 bytes apart).
	// AX is reused here, so the block count is reloaded further down.
	MOVQ            in_base+24(FP), AX
	MOVQ            (AX), DX
	MOVQ            24(AX), BX
	MOVQ            48(AX), AX

	// Data pointers of out[0..9]; SI ends up holding out[9].
	MOVQ            out_base+48(FP), SI
	MOVQ            (SI), DI
	MOVQ            24(SI), R8
	MOVQ            48(SI), R9
	MOVQ            72(SI), R10
	MOVQ            96(SI), R11
	MOVQ            120(SI), R12
	MOVQ            144(SI), R13
	MOVQ            168(SI), R14
	MOVQ            192(SI), R15
	MOVQ            216(SI), SI
	MOVQ            start+72(FP), BP

	// Add start offset to output
	ADDQ            BP, DI
	ADDQ            BP, R8
	ADDQ            BP, R9
	ADDQ            BP, R10
	ADDQ            BP, R11
	ADDQ            BP, R12
	ADDQ            BP, R13
	ADDQ            BP, R14
	ADDQ            BP, R15
	ADDQ            BP, SI

	// Add start offset to input
	ADDQ            BP, DX
	ADDQ            BP, BX
	ADDQ            BP, AX

	// Reload length to save a register: BP becomes the loop counter.
	MOVQ            n+80(FP), BP
	SHRQ            $0x05, BP

mulAvxGFNI_3x10Xor_loop:
	// Load the current contents of the 10 outputs (Xor variant accumulates).
	VMOVDQU         (DI), Y4
	VMOVDQU         (R8), Y5
	VMOVDQU         (R9), Y6
	VMOVDQU         (R10), Y7
	VMOVDQU         (R11), Y8
	VMOVDQU         (R12), Y9
	VMOVDQU         (R13), Y10
	VMOVDQU         (R14), Y11
	VMOVDQU         (R15), Y12
	VMOVDQU         (SI), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y15
	VXORPD          Y4, Y15, Y4
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y15
	VXORPD          Y5, Y15, Y5
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y15
	VXORPD          Y6, Y15, Y6
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y15
	VXORPD          Y7, Y15, Y7
	VBROADCASTSD    32(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y8, Y15, Y8
	VBROADCASTSD    40(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VBROADCASTSD    48(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VBROADCASTSD    56(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    64(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    72(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VBROADCASTSD    80(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y4, Y15, Y4
	VBROADCASTSD    88(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y5, Y15, Y5
	VBROADCASTSD    96(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y6, Y15, Y6
	VBROADCASTSD    104(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y7, Y15, Y7
	VBROADCASTSD    112(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y8, Y15, Y8
	VBROADCASTSD    120(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VBROADCASTSD    128(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VBROADCASTSD    136(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    144(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    152(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU         (AX), Y14
	ADDQ            $0x20, AX
	VBROADCASTSD    160(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y4, Y15, Y4
	VBROADCASTSD    168(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y5, Y15, Y5
	VBROADCASTSD    176(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y6, Y15, Y6
	VBROADCASTSD    184(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y7, Y15, Y7
	VBROADCASTSD    192(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y8, Y15, Y8
	VBROADCASTSD    200(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y9, Y15, Y9
	VBROADCASTSD    208(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VBROADCASTSD    216(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    224(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    232(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 10 outputs and advance each write cursor by one block.
	VMOVDQU         Y4, (DI)
	ADDQ            $0x20, DI
	VMOVDQU         Y5, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y6, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y7, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y8, (R11)
	ADDQ            $0x20, R11
	VMOVDQU         Y9, (R12)
	ADDQ            $0x20, R12
	VMOVDQU         Y10, (R13)
	ADDQ            $0x20, R13
	VMOVDQU         Y11, (R14)
	ADDQ            $0x20, R14
	VMOVDQU         Y12, (R15)
	ADDQ            $0x20, R15
	VMOVDQU         Y13, (SI)
	ADDQ            $0x20, SI

	// Prepare for next loop
	DECQ            BP
	JNZ             mulAvxGFNI_3x10Xor_loop
	VZEROUPPER

mulAvxGFNI_3x10Xor_end:
	RET
// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant term 0) for i = 0..3 and stores the XOR
// of the four results to out[0], overwriting what was there.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Z0-Z3           tables 0..3, one per input
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   DI              write cursor for out[0]
//   Z4              accumulator; Z5 input block / scratch
TEXT ·mulGFNI_4x1_64(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x1_64_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointer of out[0].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), DI
	MOVQ             start+72(FP), R8

	// Add start offset to output
	ADDQ             R8, DI

	// Add start offset to input
	ADDQ             R8, DX
	ADDQ             R8, BX
	ADDQ             R8, SI
	ADDQ             R8, CX

mulGFNI_4x1_64_loop:
	// Input 0 initialises the accumulator directly (no prior output is read).
	VMOVDQU64        (DX), Z5
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z5, Z4

	// Load and process 64 bytes from input 1 to 1 output
	VMOVDQU64        (BX), Z5
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z1, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Load and process 64 bytes from input 2 to 1 output
	VMOVDQU64        (SI), Z5
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z2, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Load and process 64 bytes from input 3 to 1 output
	VMOVDQU64        (CX), Z5
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z3, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Store 1 output
	VMOVDQU64        Z4, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x1_64_loop
	VZEROUPPER

mulGFNI_4x1_64_end:
	RET
// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant term 0) for i = 0..3 and stores the XOR
// of the four results to out[0], overwriting what was there.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Y0-Y3           tables 0..3, one per input
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   DI              write cursor for out[0]
//   Y4              accumulator; Y5 input block / scratch
TEXT ·mulAvxGFNI_4x1(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x1_end

	// All tables fit in registers; broadcast each 64-bit matrix across a YMM.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), CX

	// Data pointer of out[0].
	MOVQ            out_base+48(FP), DI
	MOVQ            (DI), DI
	MOVQ            start+72(FP), R8

	// Add start offset to output
	ADDQ            R8, DI

	// Add start offset to input
	ADDQ            R8, DX
	ADDQ            R8, BX
	ADDQ            R8, SI
	ADDQ            R8, CX

mulAvxGFNI_4x1_loop:
	// Input 0 initialises the accumulator directly (no prior output is read).
	VMOVDQU         (DX), Y5
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y5, Y4

	// Load and process 32 bytes from input 1 to 1 output
	VMOVDQU         (BX), Y5
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y1, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Load and process 32 bytes from input 2 to 1 output
	VMOVDQU         (SI), Y5
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y2, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Load and process 32 bytes from input 3 to 1 output
	VMOVDQU         (CX), Y5
	ADDQ            $0x20, CX
	VGF2P8AFFINEQB  $0x00, Y3, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Store 1 output
	VMOVDQU         Y4, (DI)
	ADDQ            $0x20, DI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x1_loop
	VZEROUPPER

mulAvxGFNI_4x1_end:
	RET
// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant term 0) for i = 0..3 and XORs the four
// results into the existing contents of out[0].
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Z0-Z3           tables 0..3, one per input
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   DI              read/write cursor for out[0]
//   Z4              accumulator; Z5 input block / scratch
TEXT ·mulGFNI_4x1_64Xor(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x1_64Xor_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointer of out[0].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), DI
	MOVQ             start+72(FP), R8

	// Add start offset to output
	ADDQ             R8, DI

	// Add start offset to input
	ADDQ             R8, DX
	ADDQ             R8, BX
	ADDQ             R8, SI
	ADDQ             R8, CX

mulGFNI_4x1_64Xor_loop:
	// Load the current output block (Xor variant accumulates into it).
	VMOVDQU64        (DI), Z4

	// Load and process 64 bytes from input 0 to 1 output
	VMOVDQU64        (DX), Z5
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Load and process 64 bytes from input 1 to 1 output
	VMOVDQU64        (BX), Z5
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z1, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Load and process 64 bytes from input 2 to 1 output
	VMOVDQU64        (SI), Z5
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z2, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Load and process 64 bytes from input 3 to 1 output
	VMOVDQU64        (CX), Z5
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z3, Z5, Z5
	VXORPD           Z4, Z5, Z4

	// Store 1 output
	VMOVDQU64        Z4, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x1_64Xor_loop
	VZEROUPPER

mulGFNI_4x1_64Xor_end:
	RET
// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i] (VGF2P8AFFINEQB, constant term 0) for i = 0..3 and XORs the four
// results into the existing contents of out[0].
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Y0-Y3           tables 0..3, one per input
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   DI              read/write cursor for out[0]
//   Y4              accumulator; Y5 input block / scratch
TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x1Xor_end

	// All tables fit in registers; broadcast each 64-bit matrix across a YMM.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), CX

	// Data pointer of out[0].
	MOVQ            out_base+48(FP), DI
	MOVQ            (DI), DI
	MOVQ            start+72(FP), R8

	// Add start offset to output
	ADDQ            R8, DI

	// Add start offset to input
	ADDQ            R8, DX
	ADDQ            R8, BX
	ADDQ            R8, SI
	ADDQ            R8, CX

mulAvxGFNI_4x1Xor_loop:
	// Load the current output block (Xor variant accumulates into it).
	VMOVDQU         (DI), Y4

	// Load and process 32 bytes from input 0 to 1 output
	VMOVDQU         (DX), Y5
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Load and process 32 bytes from input 1 to 1 output
	VMOVDQU         (BX), Y5
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y1, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Load and process 32 bytes from input 2 to 1 output
	VMOVDQU         (SI), Y5
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y2, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Load and process 32 bytes from input 3 to 1 output
	VMOVDQU         (CX), Y5
	ADDQ            $0x20, CX
	VGF2P8AFFINEQB  $0x00, Y3, Y5, Y5
	VXORPD          Y4, Y5, Y4

	// Store 1 output
	VMOVDQU         Y4, (DI)
	ADDQ            $0x20, DI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x1Xor_loop
	VZEROUPPER

mulAvxGFNI_4x1Xor_end:
	RET
// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*2+o] (VGF2P8AFFINEQB, constant term 0) and stores the XOR over
// i = 0..3 to out[o] for o = 0..1, overwriting what was there.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Z0-Z7           tables 0..7 (two consecutive tables per input)
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, DI          write cursors for out[0], out[1]
//   Z8, Z9          accumulators; Z10 input block; Z11 scratch
TEXT ·mulGFNI_4x2_64(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x2_64_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3
	VBROADCASTF32X2  32(CX), Z4
	VBROADCASTF32X2  40(CX), Z5
	VBROADCASTF32X2  48(CX), Z6
	VBROADCASTF32X2  56(CX), Z7

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointers of out[0..1]; DI ends up holding out[1].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), R8
	MOVQ             24(DI), DI
	MOVQ             start+72(FP), R9

	// Add start offset to output
	ADDQ             R9, R8
	ADDQ             R9, DI

	// Add start offset to input
	ADDQ             R9, DX
	ADDQ             R9, BX
	ADDQ             R9, SI
	ADDQ             R9, CX

mulGFNI_4x2_64_loop:
	// Input 0 initialises both accumulators directly (no prior output is read).
	VMOVDQU64        (DX), Z10
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z10, Z8
	VGF2P8AFFINEQB   $0x00, Z1, Z10, Z9

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64        (BX), Z10
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z2, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z3, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64        (SI), Z10
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z4, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z5, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64        (CX), Z10
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z6, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z7, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Store 2 outputs
	VMOVDQU64        Z8, (R8)
	ADDQ             $0x40, R8
	VMOVDQU64        Z9, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x2_64_loop
	VZEROUPPER

mulGFNI_4x2_64_end:
	RET
// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*2+o] (VGF2P8AFFINEQB, constant term 0) and stores the XOR over
// i = 0..3 to out[o] for o = 0..1, overwriting what was there.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Y0-Y7           tables 0..7 (two consecutive tables per input)
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, DI          write cursors for out[0], out[1]
//   Y8, Y9          accumulators; Y10 input block; Y11 scratch
TEXT ·mulAvxGFNI_4x2(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x2_end

	// All tables fit in registers; broadcast each 64-bit matrix across a YMM.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), CX

	// Data pointers of out[0..1]; DI ends up holding out[1].
	MOVQ            out_base+48(FP), DI
	MOVQ            (DI), R8
	MOVQ            24(DI), DI
	MOVQ            start+72(FP), R9

	// Add start offset to output
	ADDQ            R9, R8
	ADDQ            R9, DI

	// Add start offset to input
	ADDQ            R9, DX
	ADDQ            R9, BX
	ADDQ            R9, SI
	ADDQ            R9, CX

mulAvxGFNI_4x2_loop:
	// Input 0 initialises both accumulators directly (no prior output is read).
	VMOVDQU         (DX), Y10
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y10, Y8
	VGF2P8AFFINEQB  $0x00, Y1, Y10, Y9

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU         (BX), Y10
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y2, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y3, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU         (SI), Y10
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y4, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y5, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU         (CX), Y10
	ADDQ            $0x20, CX
	VGF2P8AFFINEQB  $0x00, Y6, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y7, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Store 2 outputs
	VMOVDQU         Y8, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y9, (DI)
	ADDQ            $0x20, DI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x2_loop
	VZEROUPPER

mulAvxGFNI_4x2_end:
	RET
// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*2+o] (VGF2P8AFFINEQB, constant term 0) and XORs the results for
// i = 0..3 into the existing contents of out[o], for o = 0..1.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Z0-Z7           tables 0..7 (two consecutive tables per input)
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, DI          read/write cursors for out[0], out[1]
//   Z8, Z9          accumulators; Z10 input block; Z11 scratch
TEXT ·mulGFNI_4x2_64Xor(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x2_64Xor_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3
	VBROADCASTF32X2  32(CX), Z4
	VBROADCASTF32X2  40(CX), Z5
	VBROADCASTF32X2  48(CX), Z6
	VBROADCASTF32X2  56(CX), Z7

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointers of out[0..1]; DI ends up holding out[1].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), R8
	MOVQ             24(DI), DI
	MOVQ             start+72(FP), R9

	// Add start offset to output
	ADDQ             R9, R8
	ADDQ             R9, DI

	// Add start offset to input
	ADDQ             R9, DX
	ADDQ             R9, BX
	ADDQ             R9, SI
	ADDQ             R9, CX

mulGFNI_4x2_64Xor_loop:
	// Load the current contents of the 2 outputs (Xor variant accumulates).
	VMOVDQU64        (R8), Z8
	VMOVDQU64        (DI), Z9

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64        (DX), Z10
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z1, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64        (BX), Z10
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z2, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z3, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64        (SI), Z10
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z4, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z5, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64        (CX), Z10
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z6, Z10, Z11
	VXORPD           Z8, Z11, Z8
	VGF2P8AFFINEQB   $0x00, Z7, Z10, Z11
	VXORPD           Z9, Z11, Z9

	// Store 2 outputs
	VMOVDQU64        Z8, (R8)
	ADDQ             $0x40, R8
	VMOVDQU64        Z9, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x2_64Xor_loop
	VZEROUPPER

mulGFNI_4x2_64Xor_end:
	RET
// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*2+o] (VGF2P8AFFINEQB, constant term 0) and XORs the results for
// i = 0..3 into the existing contents of out[o], for o = 0..1.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Y0-Y7           tables 0..7 (two consecutive tables per input)
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, DI          read/write cursors for out[0], out[1]
//   Y8, Y9          accumulators; Y10 input block; Y11 scratch
TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x2Xor_end

	// All tables fit in registers; broadcast each 64-bit matrix across a YMM.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), CX
	MOVQ            (CX), DX
	MOVQ            24(CX), BX
	MOVQ            48(CX), SI
	MOVQ            72(CX), CX

	// Data pointers of out[0..1]; DI ends up holding out[1].
	MOVQ            out_base+48(FP), DI
	MOVQ            (DI), R8
	MOVQ            24(DI), DI
	MOVQ            start+72(FP), R9

	// Add start offset to output
	ADDQ            R9, R8
	ADDQ            R9, DI

	// Add start offset to input
	ADDQ            R9, DX
	ADDQ            R9, BX
	ADDQ            R9, SI
	ADDQ            R9, CX

mulAvxGFNI_4x2Xor_loop:
	// Load the current contents of the 2 outputs (Xor variant accumulates).
	VMOVDQU         (R8), Y8
	VMOVDQU         (DI), Y9

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU         (DX), Y10
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y0, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y1, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU         (BX), Y10
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y2, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y3, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU         (SI), Y10
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y4, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y5, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU         (CX), Y10
	ADDQ            $0x20, CX
	VGF2P8AFFINEQB  $0x00, Y6, Y10, Y11
	VXORPD          Y8, Y11, Y8
	VGF2P8AFFINEQB  $0x00, Y7, Y10, Y11
	VXORPD          Y9, Y11, Y9

	// Store 2 outputs
	VMOVDQU         Y8, (R8)
	ADDQ            $0x20, R8
	VMOVDQU         Y9, (DI)
	ADDQ            $0x20, DI

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x2Xor_loop
	VZEROUPPER

mulAvxGFNI_4x2Xor_end:
	RET
// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*3+o] (VGF2P8AFFINEQB, constant term 0) and stores the XOR over
// i = 0..3 to out[o] for o = 0..2, overwriting what was there.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Z0-Z11          tables 0..11 (three consecutive tables per input)
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, R9, DI      write cursors for out[0], out[1], out[2]
//   Z12-Z14         accumulators; Z15 input block; Z16 scratch
TEXT ·mulGFNI_4x3_64(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x3_64_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3
	VBROADCASTF32X2  32(CX), Z4
	VBROADCASTF32X2  40(CX), Z5
	VBROADCASTF32X2  48(CX), Z6
	VBROADCASTF32X2  56(CX), Z7
	VBROADCASTF32X2  64(CX), Z8
	VBROADCASTF32X2  72(CX), Z9
	VBROADCASTF32X2  80(CX), Z10
	VBROADCASTF32X2  88(CX), Z11

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointers of out[0..2]; DI ends up holding out[2].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), R8
	MOVQ             24(DI), R9
	MOVQ             48(DI), DI
	MOVQ             start+72(FP), R10

	// Add start offset to output
	ADDQ             R10, R8
	ADDQ             R10, R9
	ADDQ             R10, DI

	// Add start offset to input
	ADDQ             R10, DX
	ADDQ             R10, BX
	ADDQ             R10, SI
	ADDQ             R10, CX

mulGFNI_4x3_64_loop:
	// Input 0 initialises the accumulators directly (no prior output is read).
	VMOVDQU64        (DX), Z15
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z15, Z12
	VGF2P8AFFINEQB   $0x00, Z1, Z15, Z13
	VGF2P8AFFINEQB   $0x00, Z2, Z15, Z14

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64        (BX), Z15
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z3, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z4, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z5, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64        (SI), Z15
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z6, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z7, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z8, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64        (CX), Z15
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z9, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z10, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z11, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Store 3 outputs
	VMOVDQU64        Z12, (R8)
	ADDQ             $0x40, R8
	VMOVDQU64        Z13, (R9)
	ADDQ             $0x40, R9
	VMOVDQU64        Z14, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x3_64_loop
	VZEROUPPER

mulGFNI_4x3_64_end:
	RET
// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*3+o] (VGF2P8AFFINEQB, constant term 0) and stores the XOR over
// i = 0..3 to out[o] for o = 0..2, overwriting what was there.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   CX              matrix base (kept live: table 11 is re-broadcast in the loop)
//   Y0-Y10          tables 0..10
//   BX, SI, DI, DX  read cursors for in[0]..in[3]
//   R9, R10, R8     write cursors for out[0], out[1], out[2]
//   Y11-Y13         accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_4x3(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x3_end

	// Only 11 of the 12 tables fit in registers; broadcast them once.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7
	VBROADCASTSD    64(CX), Y8
	VBROADCASTSD    72(CX), Y9
	VBROADCASTSD    80(CX), Y10

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), DX

	// Data pointers of out[0..2]; R8 ends up holding out[2].
	MOVQ            out_base+48(FP), R8
	MOVQ            (R8), R9
	MOVQ            24(R8), R10
	MOVQ            48(R8), R8
	MOVQ            start+72(FP), R11

	// Add start offset to output
	ADDQ            R11, R9
	ADDQ            R11, R10
	ADDQ            R11, R8

	// Add start offset to input
	ADDQ            R11, BX
	ADDQ            R11, SI
	ADDQ            R11, DI
	ADDQ            R11, DX

mulAvxGFNI_4x3_loop:
	// Input 0 initialises the accumulators directly (no prior output is read).
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU         (SI), Y14
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU         (DI), Y14
	ADDQ            $0x20, DI
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y8, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y9, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y10, Y14, Y15
	VXORPD          Y12, Y15, Y12
	// Table 11 did not fit in a register; fetch it from memory.
	VBROADCASTSD    88(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU         Y11, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y12, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y13, (R8)
	ADDQ            $0x20, R8

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x3_loop
	VZEROUPPER

mulAvxGFNI_4x3_end:
	RET
// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*3+o] (VGF2P8AFFINEQB, constant term 0) and XORs the results for
// i = 0..3 into the existing contents of out[o], for o = 0..2.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   Z0-Z11          tables 0..11 (three consecutive tables per input)
//   DX, BX, SI, CX  read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, R9, DI      read/write cursors for out[0], out[1], out[2]
//   Z12-Z14         accumulators; Z15 input block; Z16 scratch
TEXT ·mulGFNI_4x3_64Xor(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x3_64Xor_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3
	VBROADCASTF32X2  32(CX), Z4
	VBROADCASTF32X2  40(CX), Z5
	VBROADCASTF32X2  48(CX), Z6
	VBROADCASTF32X2  56(CX), Z7
	VBROADCASTF32X2  64(CX), Z8
	VBROADCASTF32X2  72(CX), Z9
	VBROADCASTF32X2  80(CX), Z10
	VBROADCASTF32X2  88(CX), Z11

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointers of out[0..2]; DI ends up holding out[2].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), R8
	MOVQ             24(DI), R9
	MOVQ             48(DI), DI
	MOVQ             start+72(FP), R10

	// Add start offset to output
	ADDQ             R10, R8
	ADDQ             R10, R9
	ADDQ             R10, DI

	// Add start offset to input
	ADDQ             R10, DX
	ADDQ             R10, BX
	ADDQ             R10, SI
	ADDQ             R10, CX

mulGFNI_4x3_64Xor_loop:
	// Load the current contents of the 3 outputs (Xor variant accumulates).
	VMOVDQU64        (R8), Z12
	VMOVDQU64        (R9), Z13
	VMOVDQU64        (DI), Z14

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64        (DX), Z15
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z1, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z2, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64        (BX), Z15
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z3, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z4, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z5, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64        (SI), Z15
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z6, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z7, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z8, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64        (CX), Z15
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z9, Z15, Z16
	VXORPD           Z12, Z16, Z12
	VGF2P8AFFINEQB   $0x00, Z10, Z15, Z16
	VXORPD           Z13, Z16, Z13
	VGF2P8AFFINEQB   $0x00, Z11, Z15, Z16
	VXORPD           Z14, Z16, Z14

	// Store 3 outputs
	VMOVDQU64        Z12, (R8)
	ADDQ             $0x40, R8
	VMOVDQU64        Z13, (R9)
	ADDQ             $0x40, R9
	VMOVDQU64        Z14, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x3_64Xor_loop
	VZEROUPPER

mulGFNI_4x3_64Xor_end:
	RET
// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*3+o] (VGF2P8AFFINEQB, constant term 0) and XORs the results for
// i = 0..3 into the existing contents of out[o], for o = 0..2.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX              remaining block count
//   CX              matrix base (kept live: table 11 is re-broadcast in the loop)
//   Y0-Y10          tables 0..10
//   BX, SI, DI, DX  read cursors for in[0]..in[3]
//   R9, R10, R8     read/write cursors for out[0], out[1], out[2]
//   Y11-Y13         accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x3Xor_end

	// Only 11 of the 12 tables fit in registers; broadcast them once.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7
	VBROADCASTSD    64(CX), Y8
	VBROADCASTSD    72(CX), Y9
	VBROADCASTSD    80(CX), Y10

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), DX

	// Data pointers of out[0..2]; R8 ends up holding out[2].
	MOVQ            out_base+48(FP), R8
	MOVQ            (R8), R9
	MOVQ            24(R8), R10
	MOVQ            48(R8), R8
	MOVQ            start+72(FP), R11

	// Add start offset to output
	ADDQ            R11, R9
	ADDQ            R11, R10
	ADDQ            R11, R8

	// Add start offset to input
	ADDQ            R11, BX
	ADDQ            R11, SI
	ADDQ            R11, DI
	ADDQ            R11, DX

mulAvxGFNI_4x3Xor_loop:
	// Load the current contents of the 3 outputs (Xor variant accumulates).
	VMOVDQU         (R9), Y11
	VMOVDQU         (R10), Y12
	VMOVDQU         (R8), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU         (SI), Y14
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU         (DI), Y14
	ADDQ            $0x20, DI
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y8, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VGF2P8AFFINEQB  $0x00, Y9, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y10, Y14, Y15
	VXORPD          Y12, Y15, Y12
	// Table 11 did not fit in a register; fetch it from memory.
	VBROADCASTSD    88(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU         Y11, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y12, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y13, (R8)
	ADDQ            $0x20, R8

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x3Xor_loop
	VZEROUPPER

mulAvxGFNI_4x3Xor_end:
	RET
// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*4+o] (VGF2P8AFFINEQB, constant term 0) and stores the XOR over
// i = 0..3 to out[o] for o = 0..3, overwriting what was there.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX                remaining block count
//   Z0-Z15            tables 0..15 (four consecutive tables per input)
//   DX, BX, SI, CX    read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, R9, R10, DI   write cursors for out[0]..out[3]
//   Z16-Z19           accumulators; Z20 input block; Z21 scratch
TEXT ·mulGFNI_4x4_64(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x4_64_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3
	VBROADCASTF32X2  32(CX), Z4
	VBROADCASTF32X2  40(CX), Z5
	VBROADCASTF32X2  48(CX), Z6
	VBROADCASTF32X2  56(CX), Z7
	VBROADCASTF32X2  64(CX), Z8
	VBROADCASTF32X2  72(CX), Z9
	VBROADCASTF32X2  80(CX), Z10
	VBROADCASTF32X2  88(CX), Z11
	VBROADCASTF32X2  96(CX), Z12
	VBROADCASTF32X2  104(CX), Z13
	VBROADCASTF32X2  112(CX), Z14
	VBROADCASTF32X2  120(CX), Z15

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointers of out[0..3]; DI ends up holding out[3].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), R8
	MOVQ             24(DI), R9
	MOVQ             48(DI), R10
	MOVQ             72(DI), DI
	MOVQ             start+72(FP), R11

	// Add start offset to output
	ADDQ             R11, R8
	ADDQ             R11, R9
	ADDQ             R11, R10
	ADDQ             R11, DI

	// Add start offset to input
	ADDQ             R11, DX
	ADDQ             R11, BX
	ADDQ             R11, SI
	ADDQ             R11, CX

mulGFNI_4x4_64_loop:
	// Input 0 initialises the accumulators directly (no prior output is read).
	VMOVDQU64        (DX), Z20
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z20, Z16
	VGF2P8AFFINEQB   $0x00, Z1, Z20, Z17
	VGF2P8AFFINEQB   $0x00, Z2, Z20, Z18
	VGF2P8AFFINEQB   $0x00, Z3, Z20, Z19

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64        (BX), Z20
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z4, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z5, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z6, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z7, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64        (SI), Z20
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z8, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z9, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z10, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z11, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64        (CX), Z20
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z12, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z13, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z14, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z15, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Store 4 outputs
	VMOVDQU64        Z16, (R8)
	ADDQ             $0x40, R8
	VMOVDQU64        Z17, (R9)
	ADDQ             $0x40, R9
	VMOVDQU64        Z18, (R10)
	ADDQ             $0x40, R10
	VMOVDQU64        Z19, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x4_64_loop
	VZEROUPPER

mulGFNI_4x4_64_end:
	RET
// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each 32-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*4+o] (VGF2P8AFFINEQB, constant term 0) and stores the XOR over
// i = 0..3 to out[o] for o = 0..3, overwriting what was there.
// Runs n>>5 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>5 is zero. The trailing n%32 bytes are not
// processed.
//
// Register roles:
//   AX                 remaining block count
//   CX                 matrix base (kept live: tables 10..15 are re-broadcast in the loop)
//   Y0-Y9              tables 0..9
//   BX, SI, DI, DX     read cursors for in[0]..in[3]
//   R9, R10, R11, R8   write cursors for out[0]..out[3]
//   Y10-Y13            accumulators; Y14 input block; Y15 scratch
TEXT ·mulAvxGFNI_4x4(SB), $0-88
	// Block count = n / 32; nothing to do if it is zero.
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x05, AX
	TESTQ           AX, AX
	JZ              mulAvxGFNI_4x4_end

	// Only 10 of the 16 tables fit in registers; broadcast them once.
	VBROADCASTSD    (CX), Y0
	VBROADCASTSD    8(CX), Y1
	VBROADCASTSD    16(CX), Y2
	VBROADCASTSD    24(CX), Y3
	VBROADCASTSD    32(CX), Y4
	VBROADCASTSD    40(CX), Y5
	VBROADCASTSD    48(CX), Y6
	VBROADCASTSD    56(CX), Y7
	VBROADCASTSD    64(CX), Y8
	VBROADCASTSD    72(CX), Y9

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), DX

	// Data pointers of out[0..3]; R8 ends up holding out[3].
	MOVQ            out_base+48(FP), R8
	MOVQ            (R8), R9
	MOVQ            24(R8), R10
	MOVQ            48(R8), R11
	MOVQ            72(R8), R8
	MOVQ            start+72(FP), R12

	// Add start offset to output
	ADDQ            R12, R9
	ADDQ            R12, R10
	ADDQ            R12, R11
	ADDQ            R12, R8

	// Add start offset to input
	ADDQ            R12, BX
	ADDQ            R12, SI
	ADDQ            R12, DI
	ADDQ            R12, DX

mulAvxGFNI_4x4_loop:
	// Input 0 initialises the accumulators directly (no prior output is read).
	VMOVDQU         (BX), Y14
	ADDQ            $0x20, BX
	VGF2P8AFFINEQB  $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB  $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB  $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB  $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU         (SI), Y14
	ADDQ            $0x20, SI
	VGF2P8AFFINEQB  $0x00, Y4, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VGF2P8AFFINEQB  $0x00, Y5, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VGF2P8AFFINEQB  $0x00, Y6, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VGF2P8AFFINEQB  $0x00, Y7, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs.
	// From table 10 onwards the matrices are fetched from memory.
	VMOVDQU         (DI), Y14
	ADDQ            $0x20, DI
	VGF2P8AFFINEQB  $0x00, Y8, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VGF2P8AFFINEQB  $0x00, Y9, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    80(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    88(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU         (DX), Y14
	ADDQ            $0x20, DX
	VBROADCASTSD    96(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y10, Y15, Y10
	VBROADCASTSD    104(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y11, Y15, Y11
	VBROADCASTSD    112(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y12, Y15, Y12
	VBROADCASTSD    120(CX), Y15
	VGF2P8AFFINEQB  $0x00, Y15, Y14, Y15
	VXORPD          Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU         Y10, (R9)
	ADDQ            $0x20, R9
	VMOVDQU         Y11, (R10)
	ADDQ            $0x20, R10
	VMOVDQU         Y12, (R11)
	ADDQ            $0x20, R11
	VMOVDQU         Y13, (R8)
	ADDQ            $0x20, R8

	// Prepare for next loop
	DECQ            AX
	JNZ             mulAvxGFNI_4x4_loop
	VZEROUPPER

mulAvxGFNI_4x4_end:
	RET
// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each 64-byte block, transforms in[i] with the 64-bit affine matrix
// matrix[i*4+o] (VGF2P8AFFINEQB, constant term 0) and XORs the results for
// i = 0..3 into the existing contents of out[o], for o = 0..3.
// Runs n>>6 iterations beginning at byte offset start of every slice; returns
// without touching memory when n>>6 is zero. The trailing n%64 bytes are not
// processed.
//
// Register roles:
//   AX                remaining block count
//   Z0-Z15            tables 0..15 (four consecutive tables per input)
//   DX, BX, SI, CX    read cursors for in[0]..in[3] (CX holds the matrix base first)
//   R8, R9, R10, DI   read/write cursors for out[0]..out[3]
//   Z16-Z19           accumulators; Z20 input block; Z21 scratch
TEXT ·mulGFNI_4x4_64Xor(SB), $0-88
	// Block count = n / 64; nothing to do if it is zero.
	MOVQ             n+80(FP), AX
	MOVQ             matrix_base+0(FP), CX
	SHRQ             $0x06, AX
	TESTQ            AX, AX
	JZ               mulGFNI_4x4_64Xor_end

	// All tables fit in registers; broadcast each 64-bit matrix across a ZMM.
	VBROADCASTF32X2  (CX), Z0
	VBROADCASTF32X2  8(CX), Z1
	VBROADCASTF32X2  16(CX), Z2
	VBROADCASTF32X2  24(CX), Z3
	VBROADCASTF32X2  32(CX), Z4
	VBROADCASTF32X2  40(CX), Z5
	VBROADCASTF32X2  48(CX), Z6
	VBROADCASTF32X2  56(CX), Z7
	VBROADCASTF32X2  64(CX), Z8
	VBROADCASTF32X2  72(CX), Z9
	VBROADCASTF32X2  80(CX), Z10
	VBROADCASTF32X2  88(CX), Z11
	VBROADCASTF32X2  96(CX), Z12
	VBROADCASTF32X2  104(CX), Z13
	VBROADCASTF32X2  112(CX), Z14
	VBROADCASTF32X2  120(CX), Z15

	// Data pointers of in[0..3] (slice headers are 24 bytes apart).
	MOVQ             in_base+24(FP), CX
	MOVQ             (CX), DX
	MOVQ             24(CX), BX
	MOVQ             48(CX), SI
	MOVQ             72(CX), CX

	// Data pointers of out[0..3]; DI ends up holding out[3].
	MOVQ             out_base+48(FP), DI
	MOVQ             (DI), R8
	MOVQ             24(DI), R9
	MOVQ             48(DI), R10
	MOVQ             72(DI), DI
	MOVQ             start+72(FP), R11

	// Add start offset to output
	ADDQ             R11, R8
	ADDQ             R11, R9
	ADDQ             R11, R10
	ADDQ             R11, DI

	// Add start offset to input
	ADDQ             R11, DX
	ADDQ             R11, BX
	ADDQ             R11, SI
	ADDQ             R11, CX

mulGFNI_4x4_64Xor_loop:
	// Load the current contents of the 4 outputs (Xor variant accumulates).
	VMOVDQU64        (R8), Z16
	VMOVDQU64        (R9), Z17
	VMOVDQU64        (R10), Z18
	VMOVDQU64        (DI), Z19

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64        (DX), Z20
	ADDQ             $0x40, DX
	VGF2P8AFFINEQB   $0x00, Z0, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z1, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z2, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z3, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64        (BX), Z20
	ADDQ             $0x40, BX
	VGF2P8AFFINEQB   $0x00, Z4, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z5, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z6, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z7, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64        (SI), Z20
	ADDQ             $0x40, SI
	VGF2P8AFFINEQB   $0x00, Z8, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z9, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z10, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z11, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64        (CX), Z20
	ADDQ             $0x40, CX
	VGF2P8AFFINEQB   $0x00, Z12, Z20, Z21
	VXORPD           Z16, Z21, Z16
	VGF2P8AFFINEQB   $0x00, Z13, Z20, Z21
	VXORPD           Z17, Z21, Z17
	VGF2P8AFFINEQB   $0x00, Z14, Z20, Z21
	VXORPD           Z18, Z21, Z18
	VGF2P8AFFINEQB   $0x00, Z15, Z20, Z21
	VXORPD           Z19, Z21, Z19

	// Store 4 outputs
	VMOVDQU64        Z16, (R8)
	ADDQ             $0x40, R8
	VMOVDQU64        Z17, (R9)
	ADDQ             $0x40, R9
	VMOVDQU64        Z18, (R10)
	ADDQ             $0x40, R10
	VMOVDQU64        Z19, (DI)
	ADDQ             $0x40, DI

	// Prepare for next loop
	DECQ             AX
	JNZ              mulGFNI_4x4_64Xor_loop
	VZEROUPPER

mulGFNI_4x4_64Xor_end:
	RET
// mulAvxGFNI_4x4Xor transforms 4 input slices with GFNI affine matrices and
// XORs the results into the existing contents of 4 output slices, using
// 256-bit (YMM) registers.
//
// matrix is read as 16 uint64 entries, input-major: entry in*4+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; the transformed bytes are XORed into output `out`.
// Entries 0-9 are kept in Y0-Y9; entries 10-15 are re-broadcast from memory
// into Y15 on every iteration.
// Processing starts `start` bytes into every input and output slice and
// covers n>>5 blocks of 32 bytes; a trailing n&31 bytes are not processed.
//
// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88
// Loading 10 of 16 tables to registers
// Destination kept in GP registers
// Full registers estimated 22 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_4x4Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// Input data pointers: BX, SI, DI, DX = in[0..3] (slice headers are 24 bytes apart).
// CX keeps pointing at the matrix for the in-loop broadcasts.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
// Output data pointers: R9, R10, R11, R8 = out[0..3]
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R8
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, R8
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, DX
mulAvxGFNI_4x4Xor_loop:
// Load 4 outputs
// Y10-Y13 start as the current output bytes and accumulate the XOR of all products
VMOVDQU (R9), Y10
VMOVDQU (R10), Y11
VMOVDQU (R11), Y12
VMOVDQU (R8), Y13
// Load and process 32 bytes from input 0 to 4 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 4 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 4 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
// From here on the matrix entry is broadcast into Y15, which then also receives the product
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 4 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 4 outputs
VMOVDQU Y10, (R9)
ADDQ $0x20, R9
VMOVDQU Y11, (R10)
ADDQ $0x20, R10
VMOVDQU Y12, (R11)
ADDQ $0x20, R11
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_4x4Xor_loop
VZEROUPPER
mulAvxGFNI_4x4Xor_end:
RET
// mulGFNI_4x5_64 transforms 4 input slices with GFNI affine matrices and
// writes the XOR-combined results to 5 output slices, overwriting them.
//
// matrix is read as 20 uint64 entries, input-major: entry in*5+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; output `out` receives the XOR of its 4 transformed inputs.
// Processing starts `start` bytes into every input and output slice and
// covers n>>6 blocks of 64 bytes; a trailing n&63 bytes are not processed.
//
// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_4x5_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 27 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_4x5_64_end
// Z0-Z19 = the 20 matrix entries, each broadcast to every 64-bit lane
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Input data pointers: DX, BX, SI, CX = in[0..3] (slice headers are 24 bytes apart).
// CX is reused because the matrix pointer is no longer needed.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), CX
// Output data pointers: R8, R9, R10, R11, DI = out[0..4]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), DI
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, DI
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, CX
mulGFNI_4x5_64_loop:
// Load and process 64 bytes from input 0 to 5 outputs
// Input 0 initialises the accumulators Z20-Z24 directly (no prior output is read)
VMOVDQU64 (DX), Z25
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z25, Z20
VGF2P8AFFINEQB $0x00, Z1, Z25, Z21
VGF2P8AFFINEQB $0x00, Z2, Z25, Z22
VGF2P8AFFINEQB $0x00, Z3, Z25, Z23
VGF2P8AFFINEQB $0x00, Z4, Z25, Z24
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (BX), Z25
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
VXORPD Z24, Z26, Z24
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (SI), Z25
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
VXORPD Z24, Z26, Z24
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (CX), Z25
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
VXORPD Z24, Z26, Z24
// Store 5 outputs
VMOVDQU64 Z20, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z21, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z22, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z23, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z24, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_4x5_64_loop
VZEROUPPER
mulGFNI_4x5_64_end:
RET
// mulAvxGFNI_4x5 transforms 4 input slices with GFNI affine matrices and
// writes the XOR-combined results to 5 output slices, overwriting them,
// using 256-bit (YMM) registers.
//
// matrix is read as 20 uint64 entries, input-major: entry in*5+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; output `out` receives the XOR of its 4 transformed inputs.
// Entries 0-8 are kept in Y0-Y8; entries 9-19 are re-broadcast from memory
// into Y15 on every iteration.
// Processing starts `start` bytes into every input and output slice and
// covers n>>5 blocks of 32 bytes; a trailing n&31 bytes are not processed.
//
// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_4x5(SB), $0-88
// Loading 9 of 20 tables to registers
// Destination kept in GP registers
// Full registers estimated 27 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_4x5_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Input data pointers: BX, SI, DI, DX = in[0..3] (slice headers are 24 bytes apart).
// CX keeps pointing at the matrix for the in-loop broadcasts.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
// Output data pointers: R9, R10, R11, R12, R8 = out[0..4]
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R8
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, R8
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, DX
mulAvxGFNI_4x5_loop:
// Load and process 32 bytes from input 0 to 5 outputs
// Input 0 initialises the accumulators Y9-Y13 directly (no prior output is read)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
// Load and process 32 bytes from input 1 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// From here on the matrix entry is broadcast into Y15, which then also receives the product
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
VMOVDQU Y9, (R9)
ADDQ $0x20, R9
VMOVDQU Y10, (R10)
ADDQ $0x20, R10
VMOVDQU Y11, (R11)
ADDQ $0x20, R11
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_4x5_loop
VZEROUPPER
mulAvxGFNI_4x5_end:
RET
// mulGFNI_4x5_64Xor transforms 4 input slices with GFNI affine matrices and
// XORs the results into the existing contents of 5 output slices.
//
// matrix is read as 20 uint64 entries, input-major: entry in*5+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; the transformed bytes are XORed into output `out`.
// Processing starts `start` bytes into every input and output slice and
// covers n>>6 blocks of 64 bytes; a trailing n&63 bytes are not processed.
//
// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_4x5_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 27 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_4x5_64Xor_end
// Z0-Z19 = the 20 matrix entries, each broadcast to every 64-bit lane
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Input data pointers: DX, BX, SI, CX = in[0..3] (slice headers are 24 bytes apart).
// CX is reused because the matrix pointer is no longer needed.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), CX
// Output data pointers: R8, R9, R10, R11, DI = out[0..4]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), DI
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, R11
ADDQ R12, DI
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, CX
mulGFNI_4x5_64Xor_loop:
// Load 5 outputs
// Z20-Z24 start as the current output bytes and accumulate the XOR of all products
VMOVDQU64 (R8), Z20
VMOVDQU64 (R9), Z21
VMOVDQU64 (R10), Z22
VMOVDQU64 (R11), Z23
VMOVDQU64 (DI), Z24
// Load and process 64 bytes from input 0 to 5 outputs
VMOVDQU64 (DX), Z25
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z1, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z2, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z3, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z4, Z25, Z26
VXORPD Z24, Z26, Z24
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (BX), Z25
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z5, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z6, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z7, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z8, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z9, Z25, Z26
VXORPD Z24, Z26, Z24
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (SI), Z25
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z11, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z12, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z13, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z14, Z25, Z26
VXORPD Z24, Z26, Z24
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (CX), Z25
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z15, Z25, Z26
VXORPD Z20, Z26, Z20
VGF2P8AFFINEQB $0x00, Z16, Z25, Z26
VXORPD Z21, Z26, Z21
VGF2P8AFFINEQB $0x00, Z17, Z25, Z26
VXORPD Z22, Z26, Z22
VGF2P8AFFINEQB $0x00, Z18, Z25, Z26
VXORPD Z23, Z26, Z23
VGF2P8AFFINEQB $0x00, Z19, Z25, Z26
VXORPD Z24, Z26, Z24
// Store 5 outputs
VMOVDQU64 Z20, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z21, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z22, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z23, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z24, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_4x5_64Xor_loop
VZEROUPPER
mulGFNI_4x5_64Xor_end:
RET
// mulAvxGFNI_4x5Xor transforms 4 input slices with GFNI affine matrices and
// XORs the results into the existing contents of 5 output slices, using
// 256-bit (YMM) registers.
//
// matrix is read as 20 uint64 entries, input-major: entry in*5+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; the transformed bytes are XORed into output `out`.
// Entries 0-8 are kept in Y0-Y8; entries 9-19 are re-broadcast from memory
// into Y15 on every iteration.
// Processing starts `start` bytes into every input and output slice and
// covers n>>5 blocks of 32 bytes; a trailing n&31 bytes are not processed.
//
// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88
// Loading 9 of 20 tables to registers
// Destination kept in GP registers
// Full registers estimated 27 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_4x5Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Input data pointers: BX, SI, DI, DX = in[0..3] (slice headers are 24 bytes apart).
// CX keeps pointing at the matrix for the in-loop broadcasts.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
// Output data pointers: R9, R10, R11, R12, R8 = out[0..4]
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R8
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, R8
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, DX
mulAvxGFNI_4x5Xor_loop:
// Load 5 outputs
// Y9-Y13 start as the current output bytes and accumulate the XOR of all products
VMOVDQU (R9), Y9
VMOVDQU (R10), Y10
VMOVDQU (R11), Y11
VMOVDQU (R12), Y12
VMOVDQU (R8), Y13
// Load and process 32 bytes from input 0 to 5 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// From here on the matrix entry is broadcast into Y15, which then also receives the product
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
VMOVDQU Y9, (R9)
ADDQ $0x20, R9
VMOVDQU Y10, (R10)
ADDQ $0x20, R10
VMOVDQU Y11, (R11)
ADDQ $0x20, R11
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_4x5Xor_loop
VZEROUPPER
mulAvxGFNI_4x5Xor_end:
RET
// mulGFNI_4x6_64 transforms 4 input slices with GFNI affine matrices and
// writes the XOR-combined results to 6 output slices, overwriting them.
//
// matrix is read as 24 uint64 entries, input-major: entry in*6+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; output `out` receives the XOR of its 4 transformed inputs.
// Processing starts `start` bytes into every input and output slice and
// covers n>>6 blocks of 64 bytes; a trailing n&63 bytes are not processed.
//
// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_4x6_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_4x6_64_end
// Z0-Z23 = the 24 matrix entries, each broadcast to every 64-bit lane
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Input data pointers: DX, BX, SI, CX = in[0..3] (slice headers are 24 bytes apart).
// CX is reused because the matrix pointer is no longer needed.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), CX
// Output data pointers: R8, R9, R10, R11, R12, DI = out[0..5]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), DI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, DI
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, CX
mulGFNI_4x6_64_loop:
// Load and process 64 bytes from input 0 to 6 outputs
// Input 0 initialises the accumulators Z24-Z29 directly (no prior output is read)
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (CX), Z30
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
VMOVDQU64 Z24, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z25, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z26, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z27, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z28, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_4x6_64_loop
VZEROUPPER
mulGFNI_4x6_64_end:
RET
// mulAvxGFNI_4x6 transforms 4 input slices with GFNI affine matrices and
// writes the XOR-combined results to 6 output slices, overwriting them,
// using 256-bit (YMM) registers.
//
// matrix is read as 24 uint64 entries, input-major: entry in*6+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; output `out` receives the XOR of its 4 transformed inputs.
// Entries 0-7 are kept in Y0-Y7; entries 8-23 are re-broadcast from memory
// into Y15 on every iteration.
// Processing starts `start` bytes into every input and output slice and
// covers n>>5 blocks of 32 bytes; a trailing n&31 bytes are not processed.
//
// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_4x6(SB), $0-88
// Loading 8 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_4x6_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Input data pointers: BX, SI, DI, DX = in[0..3] (slice headers are 24 bytes apart).
// CX keeps pointing at the matrix for the in-loop broadcasts.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
// Output data pointers: R9, R10, R11, R12, R13, R8 = out[0..5]
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R13
MOVQ 120(R8), R8
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, R8
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, DX
mulAvxGFNI_4x6_loop:
// Load and process 32 bytes from input 0 to 6 outputs
// Input 0 initialises the accumulators Y8-Y13 directly (no prior output is read)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// From here on the matrix entry is broadcast into Y15, which then also receives the product
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (R9)
ADDQ $0x20, R9
VMOVDQU Y9, (R10)
ADDQ $0x20, R10
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_4x6_loop
VZEROUPPER
mulAvxGFNI_4x6_end:
RET
// mulGFNI_4x6_64Xor transforms 4 input slices with GFNI affine matrices and
// XORs the results into the existing contents of 6 output slices.
//
// matrix is read as 24 uint64 entries, input-major: entry in*6+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; the transformed bytes are XORed into output `out`.
// Processing starts `start` bytes into every input and output slice and
// covers n>>6 blocks of 64 bytes; a trailing n&63 bytes are not processed.
//
// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_4x6_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_4x6_64Xor_end
// Z0-Z23 = the 24 matrix entries, each broadcast to every 64-bit lane
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Input data pointers: DX, BX, SI, CX = in[0..3] (slice headers are 24 bytes apart).
// CX is reused because the matrix pointer is no longer needed.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), CX
// Output data pointers: R8, R9, R10, R11, R12, DI = out[0..5]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), DI
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, DI
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, CX
mulGFNI_4x6_64Xor_loop:
// Load 6 outputs
// Z24-Z29 start as the current output bytes and accumulate the XOR of all products
VMOVDQU64 (R8), Z24
VMOVDQU64 (R9), Z25
VMOVDQU64 (R10), Z26
VMOVDQU64 (R11), Z27
VMOVDQU64 (R12), Z28
VMOVDQU64 (DI), Z29
// Load and process 64 bytes from input 0 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (CX), Z30
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
VMOVDQU64 Z24, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z25, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z26, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z27, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z28, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ AX
JNZ mulGFNI_4x6_64Xor_loop
VZEROUPPER
mulGFNI_4x6_64Xor_end:
RET
// mulAvxGFNI_4x6Xor transforms 4 input slices with GFNI affine matrices and
// XORs the results into the existing contents of 6 output slices, using
// 256-bit (YMM) registers.
//
// matrix is read as 24 uint64 entries, input-major: entry in*6+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; the transformed bytes are XORed into output `out`.
// Entries 0-7 are kept in Y0-Y7; entries 8-23 are re-broadcast from memory
// into Y15 on every iteration.
// Processing starts `start` bytes into every input and output slice and
// covers n>>5 blocks of 32 bytes; a trailing n&31 bytes are not processed.
//
// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88
// Loading 8 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_4x6Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Input data pointers: BX, SI, DI, DX = in[0..3] (slice headers are 24 bytes apart).
// CX keeps pointing at the matrix for the in-loop broadcasts.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
// Output data pointers: R9, R10, R11, R12, R13, R8 = out[0..5]
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R13
MOVQ 120(R8), R8
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, R8
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, DX
mulAvxGFNI_4x6Xor_loop:
// Load 6 outputs
// Y8-Y13 start as the current output bytes and accumulate the XOR of all products
VMOVDQU (R9), Y8
VMOVDQU (R10), Y9
VMOVDQU (R11), Y10
VMOVDQU (R12), Y11
VMOVDQU (R13), Y12
VMOVDQU (R8), Y13
// Load and process 32 bytes from input 0 to 6 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// From here on the matrix entry is broadcast into Y15, which then also receives the product
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (R9)
ADDQ $0x20, R9
VMOVDQU Y9, (R10)
ADDQ $0x20, R10
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_4x6Xor_loop
VZEROUPPER
mulAvxGFNI_4x6Xor_end:
RET
// mulGFNI_4x7_64 transforms 4 input slices with GFNI affine matrices and
// writes the XOR-combined results to 7 output slices, overwriting them.
//
// matrix is read as 28 uint64 entries, input-major: entry in*7+out is the
// 8x8 bit matrix applied (VGF2P8AFFINEQB, constant term 0) to every byte of
// input `in`; output `out` receives the XOR of its 4 transformed inputs.
// Entries 0-22 are kept in Z0-Z22; entries 23-27 are read from memory as
// embedded-broadcast operands (.BCST) on every iteration.
// Processing starts `start` bytes into every input and output slice and
// covers n>>6 blocks of 64 bytes; a trailing n&63 bytes are not processed.
//
// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_4x7_64(SB), $0-88
// Loading 23 of 28 tables to registers
// Destination kept in GP registers
// Full registers estimated 37 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ already sets ZF on a zero result,
// so the branch can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_4x7_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Input data pointers: BX, SI, DI, DX = in[0..3] (slice headers are 24 bytes apart).
// CX keeps pointing at the matrix for the in-loop memory operands.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
// Output data pointers: R9, R10, R11, R12, R13, R14, R8 = out[0..6]
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R13
MOVQ 120(R8), R14
MOVQ 144(R8), R8
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, R8
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, DX
mulGFNI_4x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// Input 0 initialises the accumulators Z23-Z29 directly (no prior output is read)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
// Remaining matrix entries are taken straight from memory with embedded broadcast
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
VMOVDQU64 Z23, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (R8)
ADDQ $0x40, R8
// Prepare for next loop
DECQ AX
JNZ mulGFNI_4x7_64_loop
VZEROUPPER
mulGFNI_4x7_64_end:
RET
// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x7 matrix of 8-byte affine tables and
// overwrites 7 output slices, 32 bytes per loop iteration. For each 32-byte
// block, beginning at byte offset start in every slice:
//   out[j] = XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*7+j])
// n>>5 blocks are processed; when n < 32 the function returns without
// touching any output. The trailing n%32 bytes are not processed here.
//
// Register use:
//   AX              remaining 32-byte blocks
//   CX              matrix base (entries 7..27 are re-broadcast from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R14, R8     write cursors for out[0..6]
//   Y0-Y6           broadcast matrix entries 0..6
//   Y7-Y13          accumulators for out[0..6]
//   Y14, Y15        current input block, scratch table/product
TEXT ·mulAvxGFNI_4x7(SB), $0-88
	// Loading 7 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..6]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulAvxGFNI_4x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (the first input writes the accumulators directly, so no prior zeroing is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// (Y15 first receives the table, then is overwritten with the product)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x7_loop
	VZEROUPPER

mulAvxGFNI_4x7_end:
	RET
// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 4 input slices by a 4x7 matrix of 8-byte affine tables and XORs
// the products into the existing contents of 7 output slices, 64 bytes per
// loop iteration. For each 64-byte block, beginning at byte offset start:
//   out[j] ^= XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*7+j])
// n>>6 blocks are processed; when n < 64 the function returns without
// touching any output. The trailing n%64 bytes are not processed here.
//
// Register use:
//   AX              remaining 64-byte blocks
//   CX              matrix base (entries 23..27 are read from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R14, R8     read/write cursors for out[0..6]
//   Z0-Z22          broadcast matrix entries 0..22
//   Z23-Z29         accumulators for out[0..6]
//   Z30, Z31        current input block, scratch product
TEXT ·mulGFNI_4x7_64Xor(SB), $0-88
	// Loading 23 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..6]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulGFNI_4x7_64Xor_loop:
	// Load 7 outputs
	// (the current output contents seed the accumulators)
	VMOVDQU64 (R9), Z23
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (entries 23..27 did not fit in registers and are broadcast from memory)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x7_64Xor_loop
	VZEROUPPER

mulGFNI_4x7_64Xor_end:
	RET
// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x7 matrix of 8-byte affine tables and XORs
// the products into the existing contents of 7 output slices, 32 bytes per
// loop iteration. For each 32-byte block, beginning at byte offset start:
//   out[j] ^= XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*7+j])
// n>>5 blocks are processed; when n < 32 the function returns without
// touching any output. The trailing n%32 bytes are not processed here.
//
// Register use:
//   AX              remaining 32-byte blocks
//   CX              matrix base (entries 7..27 are re-broadcast from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R14, R8     read/write cursors for out[0..6]
//   Y0-Y6           broadcast matrix entries 0..6
//   Y7-Y13          accumulators for out[0..6]
//   Y14, Y15        current input block, scratch table/product
TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88
	// Loading 7 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 37 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..6]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R8
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R8

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, DX

mulAvxGFNI_4x7Xor_loop:
	// Load 7 outputs
	// (the current output contents seed the accumulators)
	VMOVDQU (R9), Y7
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	// (Y15 first receives the table, then is overwritten with the product)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x7Xor_loop
	VZEROUPPER

mulAvxGFNI_4x7Xor_end:
	RET
// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 4 input slices by a 4x8 matrix of 8-byte affine tables and
// overwrites 8 output slices, 64 bytes per loop iteration. For each 64-byte
// block, beginning at byte offset start in every slice:
//   out[j] = XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*8+j])
// n>>6 blocks are processed; when n < 64 the function returns without
// touching any output. The trailing n%64 bytes are not processed here.
//
// Register use:
//   AX              remaining 64-byte blocks
//   CX              matrix base (entries 22..31 are read from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R15, R8     write cursors for out[0..7]
//   BP              start offset during setup
//   Z0-Z21          broadcast matrix entries 0..21
//   Z22-Z29         accumulators for out[0..7]
//   Z30, Z31        current input block, scratch product
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP around this function - confirm against the Go assembler docs.
TEXT ·mulGFNI_4x8_64(SB), $8-88
	// Loading 22 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..7]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulGFNI_4x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (the first input writes the accumulators directly, so no prior zeroing is needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (from entry 22 onwards the tables are broadcast from memory)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x8_64_loop
	VZEROUPPER

mulGFNI_4x8_64_end:
	RET
// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x8 matrix of 8-byte affine tables and
// overwrites 8 output slices, 32 bytes per loop iteration. For each 32-byte
// block, beginning at byte offset start in every slice:
//   out[j] = XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*8+j])
// n>>5 blocks are processed; when n < 32 the function returns without
// touching any output. The trailing n%32 bytes are not processed here.
//
// Register use:
//   AX              remaining 32-byte blocks
//   CX              matrix base (entries 6..31 are re-broadcast from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R15, R8     write cursors for out[0..7]
//   BP              start offset during setup
//   Y0-Y5           broadcast matrix entries 0..5
//   Y6-Y13          accumulators for out[0..7]
//   Y14, Y15        current input block, scratch table/product
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP around this function - confirm against the Go assembler docs.
TEXT ·mulAvxGFNI_4x8(SB), $8-88
	// Loading 6 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..7]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulAvxGFNI_4x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (the first input writes the accumulators directly; for outputs 6 and 7
	// the accumulator register itself briefly holds the table)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	// (Y15 first receives the table, then is overwritten with the product)
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x8_loop
	VZEROUPPER

mulAvxGFNI_4x8_end:
	RET
// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Multiplies 4 input slices by a 4x8 matrix of 8-byte affine tables and XORs
// the products into the existing contents of 8 output slices, 64 bytes per
// loop iteration. For each 64-byte block, beginning at byte offset start:
//   out[j] ^= XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*8+j])
// n>>6 blocks are processed; when n < 64 the function returns without
// touching any output. The trailing n%64 bytes are not processed here.
//
// Register use:
//   AX              remaining 64-byte blocks
//   CX              matrix base (entries 22..31 are read from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R15, R8     read/write cursors for out[0..7]
//   BP              start offset during setup
//   Z0-Z21          broadcast matrix entries 0..21
//   Z22-Z29         accumulators for out[0..7]
//   Z30, Z31        current input block, scratch product
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP around this function - confirm against the Go assembler docs.
TEXT ·mulGFNI_4x8_64Xor(SB), $8-88
	// Loading 22 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ mulGFNI_4x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..7]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulGFNI_4x8_64Xor_loop:
	// Load 8 outputs
	// (the current output contents seed the accumulators)
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (from entry 22 onwards the tables are broadcast from memory)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_4x8_64Xor_loop
	VZEROUPPER

mulGFNI_4x8_64Xor_end:
	RET
// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Multiplies 4 input slices by a 4x8 matrix of 8-byte affine tables and XORs
// the products into the existing contents of 8 output slices, 32 bytes per
// loop iteration. For each 32-byte block, beginning at byte offset start:
//   out[j] ^= XOR over i=0..3 of VGF2P8AFFINEQB(in[i], matrix[i*8+j])
// n>>5 blocks are processed; when n < 32 the function returns without
// touching any output. The trailing n%32 bytes are not processed here.
//
// Register use:
//   AX              remaining 32-byte blocks
//   CX              matrix base (entries 6..31 are re-broadcast from memory each iteration)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R9..R15, R8     read/write cursors for out[0..7]
//   BP              start offset during setup
//   Y0-Y5           broadcast matrix entries 0..5
//   Y6-Y13          accumulators for out[0..7]
//   Y14, Y15        current input block, scratch table/product
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP around this function - confirm against the Go assembler docs.
TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88
	// Loading 6 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Data pointers of in[0..3]; slice headers are 24 bytes apart.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX

	// Data pointers of out[0..7]; R8 is overwritten last because it holds the header base.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, DX

mulAvxGFNI_4x8Xor_loop:
	// Load 8 outputs
	// (the current output contents seed the accumulators)
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	// (entries 6 and 7 are not preloaded; Y15 holds the table, then the product)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_4x8Xor_loop
	VZEROUPPER

mulAvxGFNI_4x8Xor_end:
	RET
// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_4x9_64 combines 4 input slices into 9 output slices, 64 bytes per
// loop iteration. For output j, every input i is run through VGF2P8AFFINEQB
// (immediate 0) with the 64-bit entry matrix[i*9+j]; the 4 results are XORed
// together and stored to out[j], overwriting what was there.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>6 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>6 is zero.
//
// Register use:
//   Z0-Z20          matrix entries 0-20, broadcast once before the loop
//   Z21-Z29         accumulators for out[0..8]
//   Z30             current 64-byte input block
//   Z31             scratch for one affine result
//   DX, BX, SI, AX  read pointers for in[0..3]
//   R8-R15, DI      write pointers for out[0..8]
//   CX              matrix base (entries 21-35 are read from memory in the loop)
//   BP              remaining iterations
// NOTE(review): BP is overwritten here; the non-zero frame size ($8) is
// presumably what makes the assembler save and restore it - confirm.
TEXT ·mulGFNI_4x9_64(SB), $8-88
// Loading 21 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_4x9_64_end
// Broadcast matrix entries 0-20 into Z0-Z20
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Data pointers of in[0..3] (slice headers are 24 bytes apart); AX ends up as in[3]
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), AX
// Data pointers of out[0..8]; DI ends up as out[8]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_4x9_64_loop:
// Load and process 64 bytes from input 0 to 9 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
// Load and process 64 bytes from input 1 to 9 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
// (entries 21 and up are not resident; they are broadcast from memory)
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 9 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
VMOVDQU64 Z21, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ BP
JNZ mulGFNI_4x9_64_loop
VZEROUPPER
mulGFNI_4x9_64_end:
RET
// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_4x9 combines 4 input slices into 9 output slices, 32 bytes per
// loop iteration. For output j, every input i is run through VGF2P8AFFINEQB
// (immediate 0) with the 64-bit entry matrix[i*9+j]; the 4 results are XORed
// together and stored to out[j], overwriting what was there.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>5 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>5 is zero.
//
// Register use:
//   Y0-Y4           matrix entries 0-4, broadcast once before the loop
//   Y5-Y13          accumulators for out[0..8]
//   Y14             current 32-byte input block
//   Y15             scratch: broadcast matrix entry, then its affine result
//   DX, BX, SI, AX  read pointers for in[0..3]
//   R8-R15, DI      write pointers for out[0..8]
//   CX              matrix base (entries 5-35 are re-broadcast every iteration)
//   BP              remaining iterations
// NOTE(review): BP is overwritten here; the non-zero frame size ($8) is
// presumably what makes the assembler save and restore it - confirm.
TEXT ·mulAvxGFNI_4x9(SB), $8-88
// Loading 5 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_4x9_end
// Broadcast matrix entries 0-4 into Y0-Y4
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Data pointers of in[0..3] (slice headers are 24 bytes apart); AX ends up as in[3]
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), AX
// Data pointers of out[0..8]; DI ends up as out[8]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_4x9_loop:
// Load and process 32 bytes from input 0 to 9 outputs
// (the first input initialises the accumulators; entries 5-8 are broadcast
// straight into the accumulator register and transformed in place)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
VBROADCASTSD 40(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 48(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 56(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 64(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_4x9_loop
VZEROUPPER
mulAvxGFNI_4x9_end:
RET
// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_4x9_64Xor combines 4 input slices into 9 output slices, 64 bytes
// per loop iteration, accumulating into the existing output. For output j,
// every input i is run through VGF2P8AFFINEQB (immediate 0) with the 64-bit
// entry matrix[i*9+j]; the 4 results are XORed into the bytes already present
// in out[j] and the sum is written back.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>6 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>6 is zero.
//
// Register use:
//   Z0-Z20          matrix entries 0-20, broadcast once before the loop
//   Z21-Z29         accumulators for out[0..8], seeded from the outputs
//   Z30             current 64-byte input block
//   Z31             scratch for one affine result
//   DX, BX, SI, AX  read pointers for in[0..3]
//   R8-R15, DI      read/write pointers for out[0..8]
//   CX              matrix base (entries 21-35 are read from memory in the loop)
//   BP              remaining iterations
// NOTE(review): BP is overwritten here; the non-zero frame size ($8) is
// presumably what makes the assembler save and restore it - confirm.
TEXT ·mulGFNI_4x9_64Xor(SB), $8-88
// Loading 21 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_4x9_64Xor_end
// Broadcast matrix entries 0-20 into Z0-Z20
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Data pointers of in[0..3] (slice headers are 24 bytes apart); AX ends up as in[3]
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), AX
// Data pointers of out[0..8]; DI ends up as out[8]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_4x9_64Xor_loop:
// Load 9 outputs
VMOVDQU64 (R8), Z21
VMOVDQU64 (R9), Z22
VMOVDQU64 (R10), Z23
VMOVDQU64 (R11), Z24
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (DI), Z29
// Load and process 64 bytes from input 0 to 9 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 9 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
// (entries 21 and up are not resident; they are broadcast from memory)
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 9 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
VMOVDQU64 Z21, (R8)
ADDQ $0x40, R8
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (DI)
ADDQ $0x40, DI
// Prepare for next loop
DECQ BP
JNZ mulGFNI_4x9_64Xor_loop
VZEROUPPER
mulGFNI_4x9_64Xor_end:
RET
// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_4x9Xor combines 4 input slices into 9 output slices, 32 bytes
// per loop iteration, accumulating into the existing output. For output j,
// every input i is run through VGF2P8AFFINEQB (immediate 0) with the 64-bit
// entry matrix[i*9+j]; the 4 results are XORed into the bytes already present
// in out[j] and the sum is written back.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>5 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>5 is zero.
//
// Register use:
//   Y0-Y4           matrix entries 0-4, broadcast once before the loop
//   Y5-Y13          accumulators for out[0..8], seeded from the outputs
//   Y14             current 32-byte input block
//   Y15             scratch: broadcast matrix entry, then its affine result
//   DX, BX, SI, AX  read pointers for in[0..3]
//   R8-R15, DI      read/write pointers for out[0..8]
//   CX              matrix base (entries 5-35 are re-broadcast every iteration)
//   BP              remaining iterations
// NOTE(review): BP is overwritten here; the non-zero frame size ($8) is
// presumably what makes the assembler save and restore it - confirm.
TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88
// Loading 5 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_4x9Xor_end
// Broadcast matrix entries 0-4 into Y0-Y4
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Data pointers of in[0..3] (slice headers are 24 bytes apart); AX ends up as in[3]
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), AX
// Data pointers of out[0..8]; DI ends up as out[8]
MOVQ out_base+48(FP), DI
MOVQ (DI), R8
MOVQ 24(DI), R9
MOVQ 48(DI), R10
MOVQ 72(DI), R11
MOVQ 96(DI), R12
MOVQ 120(DI), R13
MOVQ 144(DI), R14
MOVQ 168(DI), R15
MOVQ 192(DI), DI
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, DI
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_4x9Xor_loop:
// Load 9 outputs
VMOVDQU (R8), Y5
VMOVDQU (R9), Y6
VMOVDQU (R10), Y7
VMOVDQU (R11), Y8
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (DI), Y13
// Load and process 32 bytes from input 0 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
VMOVDQU Y5, (R8)
ADDQ $0x20, R8
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (DI)
ADDQ $0x20, DI
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_4x9Xor_loop
VZEROUPPER
mulAvxGFNI_4x9Xor_end:
RET
// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_4x10_64 combines 4 input slices into 10 output slices, 64 bytes per
// loop iteration. For output j, every input i is run through VGF2P8AFFINEQB
// (immediate 0) with the 64-bit entry matrix[i*10+j]; the 4 results are XORed
// together and stored to out[j], overwriting what was there.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>6 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>6 is zero.
//
// Register use:
//   Z0-Z19          matrix entries 0-19, broadcast once before the loop
//   Z20-Z29         accumulators for out[0..9]
//   Z30             current 64-byte input block
//   Z31             scratch for one affine result
//   BX, SI, DI, DX  read pointers for in[0..3]
//   R8              base of the out slice-header array
//   R9              running byte offset into every output (starts at start)
//   R10             scratch for the data pointer of the output being stored
//   CX              matrix base (entries 20-39 are read from memory in the loop)
//   AX              remaining iterations
TEXT ·mulGFNI_4x10_64(SB), $0-88
// Loading 20 of 40 tables to registers
// Destination kept on stack
// (i.e. not in GP registers: each output pointer is re-read from the out
// slice-header array through R8 on every iteration)
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_4x10_64_end
// Broadcast matrix entries 0-19 into Z0-Z19
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Data pointers of in[0..3] (slice headers are 24 bytes apart); DX ends up as in[3]
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
MOVQ out_base+48(FP), R8
MOVQ start+72(FP), R9
// Add start offset to input
ADDQ R9, BX
ADDQ R9, SI
ADDQ R9, DI
ADDQ R9, DX
mulGFNI_4x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (entries 20 and up are not resident; they are broadcast from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
// (R10 = data pointer of out[j], R9 = current byte offset)
MOVQ (R8), R10
VMOVDQU64 Z20, (R10)(R9*1)
MOVQ 24(R8), R10
VMOVDQU64 Z21, (R10)(R9*1)
MOVQ 48(R8), R10
VMOVDQU64 Z22, (R10)(R9*1)
MOVQ 72(R8), R10
VMOVDQU64 Z23, (R10)(R9*1)
MOVQ 96(R8), R10
VMOVDQU64 Z24, (R10)(R9*1)
MOVQ 120(R8), R10
VMOVDQU64 Z25, (R10)(R9*1)
MOVQ 144(R8), R10
VMOVDQU64 Z26, (R10)(R9*1)
MOVQ 168(R8), R10
VMOVDQU64 Z27, (R10)(R9*1)
MOVQ 192(R8), R10
VMOVDQU64 Z28, (R10)(R9*1)
MOVQ 216(R8), R10
VMOVDQU64 Z29, (R10)(R9*1)
// Prepare for next loop
ADDQ $0x40, R9
DECQ AX
JNZ mulGFNI_4x10_64_loop
VZEROUPPER
mulGFNI_4x10_64_end:
RET
// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_4x10 combines 4 input slices into 10 output slices, 32 bytes per
// loop iteration. For output j, every input i is run through VGF2P8AFFINEQB
// (immediate 0) with the 64-bit entry matrix[i*10+j]; the 4 results are XORed
// together and stored to out[j], overwriting what was there.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>5 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>5 is zero.
//
// Register use:
//   Y0-Y3           matrix entries 0-3, broadcast once before the loop
//   Y4-Y13          accumulators for out[0..9]
//   Y14             current 32-byte input block
//   Y15             scratch: broadcast matrix entry, then its affine result
//   BX, SI, DI, DX  read pointers for in[0..3]
//   R8              base of the out slice-header array
//   R9              running byte offset into every output (starts at start)
//   R10             scratch for the data pointer of the output being stored
//   CX              matrix base (entries 4-39 are re-broadcast every iteration)
//   AX              remaining iterations
TEXT ·mulAvxGFNI_4x10(SB), $0-88
// Loading 4 of 40 tables to registers
// Destination kept on stack
// (i.e. not in GP registers: each output pointer is re-read from the out
// slice-header array through R8 on every iteration)
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_4x10_end
// Broadcast matrix entries 0-3 into Y0-Y3
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Data pointers of in[0..3] (slice headers are 24 bytes apart); DX ends up as in[3]
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
MOVQ out_base+48(FP), R8
MOVQ start+72(FP), R9
// Add start offset to input
ADDQ R9, BX
ADDQ R9, SI
ADDQ R9, DI
ADDQ R9, DX
mulAvxGFNI_4x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// (the first input initialises the accumulators; entries 4-9 are broadcast
// straight into the accumulator register and transformed in place)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
// (R10 = data pointer of out[j], R9 = current byte offset)
MOVQ (R8), R10
VMOVDQU Y4, (R10)(R9*1)
MOVQ 24(R8), R10
VMOVDQU Y5, (R10)(R9*1)
MOVQ 48(R8), R10
VMOVDQU Y6, (R10)(R9*1)
MOVQ 72(R8), R10
VMOVDQU Y7, (R10)(R9*1)
MOVQ 96(R8), R10
VMOVDQU Y8, (R10)(R9*1)
MOVQ 120(R8), R10
VMOVDQU Y9, (R10)(R9*1)
MOVQ 144(R8), R10
VMOVDQU Y10, (R10)(R9*1)
MOVQ 168(R8), R10
VMOVDQU Y11, (R10)(R9*1)
MOVQ 192(R8), R10
VMOVDQU Y12, (R10)(R9*1)
MOVQ 216(R8), R10
VMOVDQU Y13, (R10)(R9*1)
// Prepare for next loop
ADDQ $0x20, R9
DECQ AX
JNZ mulAvxGFNI_4x10_loop
VZEROUPPER
mulAvxGFNI_4x10_end:
RET
// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_4x10_64Xor combines 4 input slices into 10 output slices, 64 bytes
// per loop iteration, accumulating into the existing output. For output j,
// every input i is run through VGF2P8AFFINEQB (immediate 0) with the 64-bit
// entry matrix[i*10+j]; the 4 results are XORed into the bytes already
// present in out[j] and the sum is written back.
// (Presumably each matrix entry encodes multiplication by one GF(2^8)
// coefficient - confirm against the caller.)
//
// Processing begins at byte offset start in every input and output slice and
// runs for n>>6 iterations; bytes beyond that are not touched. The function
// returns immediately when n>>6 is zero.
//
// Register use:
//   Z0-Z19          matrix entries 0-19, broadcast once before the loop
//   Z20-Z29         accumulators for out[0..9], seeded from the outputs
//   Z30             current 64-byte input block
//   Z31             scratch for one affine result
//   BX, SI, DI, DX  read pointers for in[0..3]
//   R8              base of the out slice-header array
//   R9              running byte offset into every output (starts at start)
//   R10             scratch for the data pointer of the output being accessed
//   CX              matrix base (entries 20-39 are read from memory in the loop)
//   AX              remaining iterations
TEXT ·mulGFNI_4x10_64Xor(SB), $0-88
// Loading 20 of 40 tables to registers
// Destination kept on stack
// (i.e. not in GP registers: each output pointer is re-read from the out
// slice-header array through R8 on every iteration)
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_4x10_64Xor_end
// Broadcast matrix entries 0-19 into Z0-Z19
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Data pointers of in[0..3] (slice headers are 24 bytes apart); DX ends up as in[3]
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), DX
MOVQ out_base+48(FP), R8
MOVQ start+72(FP), R9
// Add start offset to input
ADDQ R9, BX
ADDQ R9, SI
ADDQ R9, DI
ADDQ R9, DX
mulGFNI_4x10_64Xor_loop:
// Load 10 outputs
// (R10 = data pointer of out[j], R9 = current byte offset)
MOVQ (R8), R10
VMOVDQU64 (R10)(R9*1), Z20
MOVQ 24(R8), R10
VMOVDQU64 (R10)(R9*1), Z21
MOVQ 48(R8), R10
VMOVDQU64 (R10)(R9*1), Z22
MOVQ 72(R8), R10
VMOVDQU64 (R10)(R9*1), Z23
MOVQ 96(R8), R10
VMOVDQU64 (R10)(R9*1), Z24
MOVQ 120(R8), R10
VMOVDQU64 (R10)(R9*1), Z25
MOVQ 144(R8), R10
VMOVDQU64 (R10)(R9*1), Z26
MOVQ 168(R8), R10
VMOVDQU64 (R10)(R9*1), Z27
MOVQ 192(R8), R10
VMOVDQU64 (R10)(R9*1), Z28
MOVQ 216(R8), R10
VMOVDQU64 (R10)(R9*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (entries 20 and up are not resident; they are broadcast from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R8), R10
VMOVDQU64 Z20, (R10)(R9*1)
MOVQ 24(R8), R10
VMOVDQU64 Z21, (R10)(R9*1)
MOVQ 48(R8), R10
VMOVDQU64 Z22, (R10)(R9*1)
MOVQ 72(R8), R10
VMOVDQU64 Z23, (R10)(R9*1)
MOVQ 96(R8), R10
VMOVDQU64 Z24, (R10)(R9*1)
MOVQ 120(R8), R10
VMOVDQU64 Z25, (R10)(R9*1)
MOVQ 144(R8), R10
VMOVDQU64 Z26, (R10)(R9*1)
MOVQ 168(R8), R10
VMOVDQU64 Z27, (R10)(R9*1)
MOVQ 192(R8), R10
VMOVDQU64 Z28, (R10)(R9*1)
MOVQ 216(R8), R10
VMOVDQU64 Z29, (R10)(R9*1)
// Prepare for next loop
ADDQ $0x40, R9
DECQ AX
JNZ mulGFNI_4x10_64Xor_loop
VZEROUPPER
mulGFNI_4x10_64Xor_end:
RET
// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_4x10Xor accumulates into ten outputs from four inputs:
//   out[j][off:off+32] ^= XOR over i of affine(matrix[i*10+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX              remaining 32-byte blocks
//   CX              matrix base (only entries 0-3 stay resident in Y0-Y3)
//   BX, SI, DI, DX  read cursors for in[0..3]
//   R8              base of the out slice headers (24 bytes per header)
//   R9              running byte offset into every output (starts at start)
//   R10             scratch: data pointer of the output being loaded/stored
//   Y4-Y13          accumulators for out[0..9]
//   Y14 / Y15       current input block / scratch (table or product)
TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88
	// Loading 4 of 40 tables to registers
	// Destination kept on stack
	// Full registers estimated 52 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_4x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, DX

mulAvxGFNI_4x10Xor_loop:
	// Load 10 outputs
	// (output data pointers are re-read from the slice headers every
	// iteration; R9 supplies the offset)
	MOVQ (R8), R10
	VMOVDQU (R10)(R9*1), Y4
	MOVQ 24(R8), R10
	VMOVDQU (R10)(R9*1), Y5
	MOVQ 48(R8), R10
	VMOVDQU (R10)(R9*1), Y6
	MOVQ 72(R8), R10
	VMOVDQU (R10)(R9*1), Y7
	MOVQ 96(R8), R10
	VMOVDQU (R10)(R9*1), Y8
	MOVQ 120(R8), R10
	VMOVDQU (R10)(R9*1), Y9
	MOVQ 144(R8), R10
	VMOVDQU (R10)(R9*1), Y10
	MOVQ 168(R8), R10
	VMOVDQU (R10)(R9*1), Y11
	MOVQ 192(R8), R10
	VMOVDQU (R10)(R9*1), Y12
	MOVQ 216(R8), R10
	VMOVDQU (R10)(R9*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	// Remaining tables are broadcast from memory on demand.
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R8), R10
	VMOVDQU Y4, (R10)(R9*1)
	MOVQ 24(R8), R10
	VMOVDQU Y5, (R10)(R9*1)
	MOVQ 48(R8), R10
	VMOVDQU Y6, (R10)(R9*1)
	MOVQ 72(R8), R10
	VMOVDQU Y7, (R10)(R9*1)
	MOVQ 96(R8), R10
	VMOVDQU Y8, (R10)(R9*1)
	MOVQ 120(R8), R10
	VMOVDQU Y9, (R10)(R9*1)
	MOVQ 144(R8), R10
	VMOVDQU Y10, (R10)(R9*1)
	MOVQ 168(R8), R10
	VMOVDQU Y11, (R10)(R9*1)
	MOVQ 192(R8), R10
	VMOVDQU Y12, (R10)(R9*1)
	MOVQ 216(R8), R10
	VMOVDQU Y13, (R10)(R9*1)

	// Prepare for next loop
	ADDQ $0x20, R9
	DECQ AX
	JNZ mulAvxGFNI_4x10Xor_loop
	VZEROUPPER

mulAvxGFNI_4x10Xor_end:
	RET
// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x1_64 overwrites one output computed from five inputs:
//   out[0][off:off+64] = XOR over i of affine(matrix[i], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z4               matrix[0..4], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R8                  write cursor for out[0]
//   Z5 / Z6             accumulator / current input block and product
TEXT ·mulGFNI_5x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x1_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulGFNI_5x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// (first product initialises the accumulator, so no XOR is needed)
	VMOVDQU64 (DX), Z6
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z5

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z6
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z6
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z6
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (CX), Z6
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Store 1 outputs
	VMOVDQU64 Z5, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x1_64_loop
	VZEROUPPER

mulGFNI_5x1_64_end:
	RET
// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x1 overwrites one output computed from five inputs:
//   out[0][off:off+32] = XOR over i of affine(matrix[i], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX                  remaining 32-byte blocks
//   Y0-Y4               matrix[0..4], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R8                  write cursor for out[0]
//   Y5 / Y6             accumulator / current input block and product
TEXT ·mulAvxGFNI_5x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x1_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulAvxGFNI_5x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// (first product initialises the accumulator, so no XOR is needed)
	VMOVDQU (DX), Y6
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y5

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y6
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y6
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y6
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (CX), Y6
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Store 1 outputs
	VMOVDQU Y5, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x1_loop
	VZEROUPPER

mulAvxGFNI_5x1_end:
	RET
// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x1_64Xor accumulates into one output from five inputs:
//   out[0][off:off+64] ^= XOR over i of affine(matrix[i], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z4               matrix[0..4], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R8                  read/write cursor for out[0]
//   Z5 / Z6             accumulator / current input block and product
TEXT ·mulGFNI_5x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulGFNI_5x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R8), Z5

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z6
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z6
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z6
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z6
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (CX), Z6
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z4, Z6, Z6
	VXORPD Z5, Z6, Z5

	// Store 1 outputs
	VMOVDQU64 Z5, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x1_64Xor_loop
	VZEROUPPER

mulGFNI_5x1_64Xor_end:
	RET
// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x1Xor accumulates into one output from five inputs:
//   out[0][off:off+32] ^= XOR over i of affine(matrix[i], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX                  remaining 32-byte blocks
//   Y0-Y4               matrix[0..4], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R8                  read/write cursor for out[0]
//   Y5 / Y6             accumulator / current input block and product
TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 8 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX

mulAvxGFNI_5x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R8), Y5

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y6
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y6
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y6
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y6
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (CX), Y6
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y4, Y6, Y6
	VXORPD Y5, Y6, Y5

	// Store 1 outputs
	VMOVDQU Y5, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x1Xor_loop
	VZEROUPPER

mulAvxGFNI_5x1Xor_end:
	RET
// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x2_64 overwrites two outputs computed from five inputs:
//   out[j][off:off+64] = XOR over i of affine(matrix[i*2+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z9               matrix[0..9], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R8              write cursors for out[0], out[1]
//   Z10, Z11            accumulators for out[0], out[1]
//   Z12 / Z13           current input block / scratch product
TEXT ·mulGFNI_5x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulGFNI_5x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// (first products initialise the accumulators, so no XOR is needed)
	VMOVDQU64 (DX), Z12
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z11

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z12
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z12
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z12
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (CX), Z12
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Store 2 outputs
	VMOVDQU64 Z10, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z11, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x2_64_loop
	VZEROUPPER

mulGFNI_5x2_64_end:
	RET
// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x2 overwrites two outputs computed from five inputs:
//   out[j][off:off+32] = XOR over i of affine(matrix[i*2+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX                  remaining 32-byte blocks
//   Y0-Y9               matrix[0..9], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R8              write cursors for out[0], out[1]
//   Y10, Y11            accumulators for out[0], out[1]
//   Y12 / Y13           current input block / scratch product
TEXT ·mulAvxGFNI_5x2(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulAvxGFNI_5x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// (first products initialise the accumulators, so no XOR is needed)
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y11

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (SI), Y12
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (DI), Y12
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (CX), Y12
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Store 2 outputs
	VMOVDQU Y10, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y11, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x2_loop
	VZEROUPPER

mulAvxGFNI_5x2_end:
	RET
// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x2_64Xor accumulates into two outputs from five inputs:
//   out[j][off:off+64] ^= XOR over i of affine(matrix[i*2+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z9               matrix[0..9], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R8              read/write cursors for out[0], out[1]
//   Z10, Z11            accumulators for out[0], out[1]
//   Z12 / Z13           current input block / scratch product
TEXT ·mulGFNI_5x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulGFNI_5x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R9), Z10
	VMOVDQU64 (R8), Z11

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (DX), Z12
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z12
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z12
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z12
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (CX), Z12
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
	VXORPD Z10, Z13, Z10
	VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
	VXORPD Z11, Z13, Z11

	// Store 2 outputs
	VMOVDQU64 Z10, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z11, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x2_64Xor_loop
	VZEROUPPER

mulGFNI_5x2_64Xor_end:
	RET
// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x2Xor accumulates into two outputs from five inputs:
//   out[j][off:off+32] ^= XOR over i of affine(matrix[i*2+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX                  remaining 32-byte blocks
//   Y0-Y9               matrix[0..9], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R8              read/write cursors for out[0], out[1]
//   Y10, Y11            accumulators for out[0], out[1]
//   Y12 / Y13           current input block / scratch product
TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R8
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9
	ADDQ R10, R8

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, CX

mulAvxGFNI_5x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R9), Y10
	VMOVDQU (R8), Y11

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (SI), Y12
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (DI), Y12
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (CX), Y12
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
	VXORPD Y10, Y13, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
	VXORPD Y11, Y13, Y11

	// Store 2 outputs
	VMOVDQU Y10, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y11, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x2Xor_loop
	VZEROUPPER

mulAvxGFNI_5x2Xor_end:
	RET
// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x3_64 overwrites three outputs computed from five inputs:
//   out[j][off:off+64] = XOR over i of affine(matrix[i*3+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z14              matrix[0..14], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R10, R8         write cursors for out[0], out[1], out[2]
//   Z15, Z16, Z17       accumulators for out[0], out[1], out[2]
//   Z18 / Z19           current input block / scratch product
TEXT ·mulGFNI_5x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R8
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, R8

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, CX

mulGFNI_5x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (first products initialise the accumulators, so no XOR is needed)
	VMOVDQU64 (DX), Z18
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z17

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z18
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z18
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z18
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (CX), Z18
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Store 3 outputs
	VMOVDQU64 Z15, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z16, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z17, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x3_64_loop
	VZEROUPPER

mulGFNI_5x3_64_end:
	RET
// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x3 overwrites three outputs computed from five inputs:
//   out[j][off:off+32] = XOR over i of affine(matrix[i*3+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX                  remaining 32-byte blocks
//   CX                  matrix base (entries 11-14 are broadcast on demand)
//   Y0-Y10              matrix[0..10], broadcast to every qword lane
//   BX, SI, DI, R8, DX  read cursors for in[0..4]
//   R10, R11, R9        write cursors for out[0], out[1], out[2]
//   Y11, Y12, Y13       accumulators for out[0], out[1], out[2]
//   Y14 / Y15           current input block / scratch (table or product)
TEXT ·mulAvxGFNI_5x3(SB), $0-88
	// Loading 11 of 15 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// CX must keep pointing at the matrix for the in-loop broadcasts, so the
	// input headers (24 bytes each) are walked through DX instead.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, DX

mulAvxGFNI_5x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first products initialise the accumulators, so no XOR is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	// From here on the tables no longer fit in registers and are
	// broadcast from memory right before use.
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x3_loop
	VZEROUPPER

mulAvxGFNI_5x3_end:
	RET
// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x3_64Xor accumulates into three outputs from five inputs:
//   out[j][off:off+64] ^= XOR over i of affine(matrix[i*3+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z14              matrix[0..14], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R10, R8         read/write cursors for out[0], out[1], out[2]
//   Z15, Z16, Z17       accumulators for out[0], out[1], out[2]
//   Z18 / Z19           current input block / scratch product
TEXT ·mulGFNI_5x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R8
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R9
	ADDQ R11, R10
	ADDQ R11, R8

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, CX

mulGFNI_5x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R9), Z15
	VMOVDQU64 (R10), Z16
	VMOVDQU64 (R8), Z17

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z18
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z18
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z18
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z18
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (CX), Z18
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
	VXORPD Z15, Z19, Z15
	VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
	VXORPD Z16, Z19, Z16
	VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
	VXORPD Z17, Z19, Z17

	// Store 3 outputs
	VMOVDQU64 Z15, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z16, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z17, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x3_64Xor_loop
	VZEROUPPER

mulGFNI_5x3_64Xor_end:
	RET
// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x3Xor accumulates into three outputs from five inputs:
//   out[j][off:off+32] ^= XOR over i of affine(matrix[i*3+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 32 bytes are processed for n>>5 iterations beginning at byte
// offset start; a remainder of n below a multiple of 32 is left untouched and
// the function returns before reading in/out when n < 32.
//
// Register roles inside the loop:
//   AX                  remaining 32-byte blocks
//   CX                  matrix base (entries 11-14 are broadcast on demand)
//   Y0-Y10              matrix[0..10], broadcast to every qword lane
//   BX, SI, DI, R8, DX  read cursors for in[0..4]
//   R10, R11, R9        read/write cursors for out[0], out[1], out[2]
//   Y11, Y12, Y13       accumulators for out[0], out[1], out[2]
//   Y14 / Y15           current input block / scratch (table or product)
TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88
	// Loading 11 of 15 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 20 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// CX must keep pointing at the matrix for the in-loop broadcasts, so the
	// input headers (24 bytes each) are walked through DX instead.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, DX

mulAvxGFNI_5x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R10), Y11
	VMOVDQU (R11), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	// From here on the tables no longer fit in registers and are
	// broadcast from memory right before use.
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y12, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x3Xor_loop
	VZEROUPPER

mulAvxGFNI_5x3Xor_end:
	RET
// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x4_64 overwrites four outputs computed from five inputs:
//   out[j][off:off+64] = XOR over i of affine(matrix[i*4+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant term (each 64-bit matrix
// entry is used as an 8x8 bit matrix applied to every input byte).
// Blocks of 64 bytes are processed for n>>6 iterations beginning at byte
// offset start; a remainder of n below a multiple of 64 is left untouched and
// the function returns before reading in/out when n < 64.
//
// Register roles inside the loop:
//   AX                  remaining 64-byte blocks
//   Z0-Z19              matrix[0..19], broadcast to every qword lane
//   DX, BX, SI, DI, CX  read cursors for in[0..4]
//   R9, R10, R11, R8    write cursors for out[0..3]
//   Z20-Z23             accumulators for out[0..3]
//   Z24 / Z25           current input block / scratch product
TEXT ·mulGFNI_5x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero immediate sets ZF from its result, so the
	// zero-iteration check needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19

	// Every table is now resident, so CX is recycled for the input headers
	// (24 bytes per slice header) and finally for the in[4] cursor.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R8
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R8

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, CX

mulGFNI_5x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first products initialise the accumulators, so no XOR is needed)
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z23

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 4 outputs
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x4_64_loop
	VZEROUPPER

mulGFNI_5x4_64_end:
	RET
// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x4 combines 5 input slices into 4 output slices, 32 bytes per
// loop iteration. Output o receives the XOR, over all inputs i, of
// VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at index
// i*4+o. Previous output contents are overwritten.
// The loop runs n>>5 times, so a trailing n%32 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulAvxGFNI_5x4(SB), $0-88
	// Loading 10 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x4_end
	// Y0-Y9 = matrix entries 0-9, each broadcast to every 64-bit lane.
	// Entries 10-19 are re-broadcast from (CX) inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	// BX, SI, DI, R8, DX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	// R10, R11, R12, R9 = data pointers of out[0..3]
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13
	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9
	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, DX
mulAvxGFNI_5x4_loop:
	// Y10-Y13 accumulate outputs 0-3, Y14 holds the current input block,
	// Y15 is scratch (matrix entry, then product).
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 4 outputs
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x4_loop
	VZEROUPPER
mulAvxGFNI_5x4_end:
	RET
// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x4_64Xor combines 5 input slices into 4 output slices, 64 bytes
// per loop iteration. Output o is loaded first and then XORed with, for every
// input i, VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at
// index i*4+o, so existing output contents are accumulated into.
// The loop runs n>>6 times, so a trailing n%64 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulGFNI_5x4_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ mulGFNI_5x4_64Xor_end
	// Z0-Z19 = matrix entries 0-19, each broadcast to every 64-bit lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	// DX, BX, SI, DI, CX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart); CX is reused because the matrix is fully in registers
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	// R9, R10, R11, R8 = data pointers of out[0..3]
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R8
	MOVQ start+72(FP), R12
	// Add start offset to output
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R8
	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, CX
mulGFNI_5x4_64Xor_loop:
	// Z20-Z23 accumulate outputs 0-3, Z24 holds the current input block,
	// Z25 is scratch for each product before it is XORed in.
	// Load 4 outputs
	VMOVDQU64 (R9), Z20
	VMOVDQU64 (R10), Z21
	VMOVDQU64 (R11), Z22
	VMOVDQU64 (R8), Z23
	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD Z23, Z25, Z23
	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z23, Z25, Z23
	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23
	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z23, Z25, Z23
	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z20, Z25, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z23, Z25, Z23
	// Store 4 outputs
	VMOVDQU64 Z20, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z21, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z22, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z23, (R8)
	ADDQ $0x40, R8
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x4_64Xor_loop
	VZEROUPPER
mulGFNI_5x4_64Xor_end:
	RET
// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x4Xor combines 5 input slices into 4 output slices, 32 bytes
// per loop iteration. Output o is loaded first and then XORed with, for every
// input i, VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at
// index i*4+o, so existing output contents are accumulated into.
// The loop runs n>>5 times, so a trailing n%32 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88
	// Loading 10 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x4Xor_end
	// Y0-Y9 = matrix entries 0-9, each broadcast to every 64-bit lane.
	// Entries 10-19 are re-broadcast from (CX) inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	// BX, SI, DI, R8, DX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	// R10, R11, R12, R9 = data pointers of out[0..3]
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13
	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9
	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, DX
mulAvxGFNI_5x4Xor_loop:
	// Y10-Y13 accumulate outputs 0-3, Y14 holds the current input block,
	// Y15 is scratch (matrix entry, then product).
	// Load 4 outputs
	VMOVDQU (R10), Y10
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (R9), Y13
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 4 outputs
	VMOVDQU Y10, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y11, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x4Xor_loop
	VZEROUPPER
mulAvxGFNI_5x4Xor_end:
	RET
// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x5_64 combines 5 input slices into 5 output slices, 64 bytes per
// loop iteration. Output o receives the XOR, over all inputs i, of
// VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at index
// i*5+o. Previous output contents are overwritten.
// The loop runs n>>6 times, so a trailing n%64 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulGFNI_5x5_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ mulGFNI_5x5_64_end
	// Z0-Z24 = matrix entries 0-24, each broadcast to every 64-bit lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	// DX, BX, SI, DI, CX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart); CX is reused because the matrix is fully in registers
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	// R9, R10, R11, R12, R8 = data pointers of out[0..4]
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R8
	MOVQ start+72(FP), R13
	// Add start offset to output
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R8
	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, CX
mulGFNI_5x5_64_loop:
	// Z25-Z29 accumulate outputs 0-4, Z30 holds the current input block,
	// Z31 is scratch for each product before it is XORed in.
	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 5 outputs
	VMOVDQU64 Z25, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z26, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z27, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z28, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x5_64_loop
	VZEROUPPER
mulGFNI_5x5_64_end:
	RET
// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x5 combines 5 input slices into 5 output slices, 32 bytes per
// loop iteration. Output o receives the XOR, over all inputs i, of
// VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at index
// i*5+o. Previous output contents are overwritten.
// The loop runs n>>5 times, so a trailing n%32 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulAvxGFNI_5x5(SB), $0-88
	// Loading 9 of 25 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x5_end
	// Y0-Y8 = matrix entries 0-8, each broadcast to every 64-bit lane.
	// Entries 9-24 are re-broadcast from (CX) inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	// BX, SI, DI, R8, DX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	// R10, R11, R12, R13, R9 = data pointers of out[0..4]
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R9
	MOVQ start+72(FP), R14
	// Add start offset to output
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R9
	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, DX
mulAvxGFNI_5x5_loop:
	// Y9-Y13 accumulate outputs 0-4, Y14 holds the current input block,
	// Y15 is scratch (matrix entry, then product).
	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 5 outputs
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x5_loop
	VZEROUPPER
mulAvxGFNI_5x5_end:
	RET
// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x5_64Xor combines 5 input slices into 5 output slices, 64 bytes
// per loop iteration. Output o is loaded first and then XORed with, for every
// input i, VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at
// index i*5+o, so existing output contents are accumulated into.
// The loop runs n>>6 times, so a trailing n%64 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulGFNI_5x5_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ mulGFNI_5x5_64Xor_end
	// Z0-Z24 = matrix entries 0-24, each broadcast to every 64-bit lane
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	// DX, BX, SI, DI, CX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart); CX is reused because the matrix is fully in registers
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	// R9, R10, R11, R12, R8 = data pointers of out[0..4]
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R8
	MOVQ start+72(FP), R13
	// Add start offset to output
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R8
	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, CX
mulGFNI_5x5_64Xor_loop:
	// Z25-Z29 accumulate outputs 0-4, Z30 holds the current input block,
	// Z31 is scratch for each product before it is XORed in.
	// Load 5 outputs
	VMOVDQU64 (R9), Z25
	VMOVDQU64 (R10), Z26
	VMOVDQU64 (R11), Z27
	VMOVDQU64 (R12), Z28
	VMOVDQU64 (R8), Z29
	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 5 outputs
	VMOVDQU64 Z25, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z26, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z27, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z28, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x5_64Xor_loop
	VZEROUPPER
mulGFNI_5x5_64Xor_end:
	RET
// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x5Xor combines 5 input slices into 5 output slices, 32 bytes
// per loop iteration. Output o is loaded first and then XORed with, for every
// input i, VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at
// index i*5+o, so existing output contents are accumulated into.
// The loop runs n>>5 times, so a trailing n%32 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88
	// Loading 9 of 25 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x5Xor_end
	// Y0-Y8 = matrix entries 0-8, each broadcast to every 64-bit lane.
	// Entries 9-24 are re-broadcast from (CX) inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	// BX, SI, DI, R8, DX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	// R10, R11, R12, R13, R9 = data pointers of out[0..4]
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R9
	MOVQ start+72(FP), R14
	// Add start offset to output
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R9
	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, DX
mulAvxGFNI_5x5Xor_loop:
	// Y9-Y13 accumulate outputs 0-4, Y14 holds the current input block,
	// Y15 is scratch (matrix entry, then product).
	// Load 5 outputs
	VMOVDQU (R10), Y9
	VMOVDQU (R11), Y10
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R9), Y13
	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 5 outputs
	VMOVDQU Y9, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y10, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x5Xor_loop
	VZEROUPPER
mulAvxGFNI_5x5Xor_end:
	RET
// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_5x6_64 combines 5 input slices into 6 output slices, 64 bytes per
// loop iteration. Output o receives the XOR, over all inputs i, of
// VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at index
// i*6+o. Previous output contents are overwritten.
// The loop runs n>>6 times, so a trailing n%64 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulGFNI_5x6_64(SB), $0-88
	// Loading 24 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ mulGFNI_5x6_64_end
	// Z0-Z23 = matrix entries 0-23, each broadcast to every 64-bit lane.
	// Entries 24-29 are used as broadcast memory operands from (CX) in the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	// BX, SI, DI, R8, DX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	// R10, R11, R12, R13, R14, R9 = data pointers of out[0..5]
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15
	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9
	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX
mulGFNI_5x6_64_loop:
	// Z24-Z29 accumulate outputs 0-5, Z30 holds the current input block,
	// Z31 is scratch for each product before it is XORed in.
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 6 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x6_64_loop
	VZEROUPPER
mulGFNI_5x6_64_end:
	RET
// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_5x6 combines 5 input slices into 6 output slices, 32 bytes per
// loop iteration. Output o receives the XOR, over all inputs i, of
// VGF2P8AFFINEQB applied to input i with the 64-bit matrix entry at index
// i*6+o. Previous output contents are overwritten.
// The loop runs n>>5 times, so a trailing n%32 bytes are not processed, and
// processing begins at byte offset start within every input and output slice.
TEXT ·mulAvxGFNI_5x6(SB), $0-88
	// Loading 8 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// zero check needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x6_end
	// Y0-Y7 = matrix entries 0-7, each broadcast to every 64-bit lane.
	// Entries 8-29 are re-broadcast from (CX) inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	// BX, SI, DI, R8, DX = data pointers of in[0..4] (slice headers are
	// 24 bytes apart)
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	// R10, R11, R12, R13, R14, R9 = data pointers of out[0..5]
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15
	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9
	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX
mulAvxGFNI_5x6_loop:
	// Y8-Y13 accumulate outputs 0-5, Y14 holds the current input block,
	// Y15 is scratch (matrix entry, then product).
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 6 outputs
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x6_loop
	VZEROUPPER
mulAvxGFNI_5x6_end:
	RET
// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*6 + output) and XORs the products
// into the data already stored in the 6 outputs. A tail of fewer than 64
// bytes is left untouched.
//
// Register roles:
//   AX                 remaining 64-byte blocks
//   CX                 matrix base
//   BX, SI, DI, R8, DX cursors for inputs 0..4
//   R10..R14, R9       cursors for outputs 0..5
//   Z0-Z23             matrix entries 0..23, broadcast across the register
//   Z24-Z29            accumulators for outputs 0..5
//   Z30                current input block
//   Z31                scratch product
TEXT ·mulGFNI_5x6_64Xor(SB), $0-88
	// Loading 24 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64. SHRQ with a non-zero count sets ZF from its result, so the
	// early-exit branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x6_64Xor_end

	// Matrix entries 0..23 (all of inputs 0..3) stay resident in Z0-Z23;
	// entries 24..29 (input 4) are read from memory inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header array address.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Data pointers of out[0..5]; R9 is loaded last for the same reason.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX

mulGFNI_5x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (R10), Z24
	VMOVDQU64 (R11), Z25
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R9), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	// (matrix entries 24..29 come straight from memory via .BCST)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x6_64Xor_loop
	VZEROUPPER

mulGFNI_5x6_64Xor_end:
	RET
// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*6 + output) and XORs the products
// into the data already stored in the 6 outputs. A tail of fewer than 32
// bytes is left untouched.
//
// Register roles:
//   AX                 remaining 32-byte blocks
//   CX                 matrix base
//   BX, SI, DI, R8, DX cursors for inputs 0..4
//   R10..R14, R9       cursors for outputs 0..5
//   Y0-Y7              matrix entries 0..7, broadcast across the register
//   Y8-Y13             accumulators for outputs 0..5
//   Y14                current input block
//   Y15                scratch (on-the-fly matrix entry, then product)
TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88
	// Loading 8 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ with a non-zero count sets ZF from its result, so the
	// early-exit branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x6Xor_end

	// Only matrix entries 0..7 stay resident; the remaining 22 are
	// re-broadcast into Y15 inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header array address.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Data pointers of out[0..5]; R9 is loaded last for the same reason.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX

mulAvxGFNI_5x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (R10), Y8
	VMOVDQU (R11), Y9
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	VMOVDQU Y8, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x6Xor_loop
	VZEROUPPER

mulAvxGFNI_5x6Xor_end:
	RET
// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*7 + output), XORs the 5 products
// together per output and overwrites the 7 outputs with the result. A tail
// of fewer than 64 bytes is left untouched.
//
// Register roles:
//   AX                 remaining 64-byte blocks
//   CX                 matrix base
//   BX, SI, DI, R8, DX cursors for inputs 0..4
//   R10..R15, R9       cursors for outputs 0..6
//   BP                 start offset (setup only)
//   Z0-Z22             matrix entries 0..22, broadcast across the register
//   Z23-Z29            accumulators for outputs 0..6
//   Z30                current input block
//   Z31                scratch product
//
// NOTE(review): BP is used as a general-purpose register; the 8-byte frame
// is presumably what makes the Go assembler save and restore it - confirm
// against the generator before changing the frame size.
TEXT ·mulGFNI_5x7_64(SB), $8-88
	// Loading 23 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64. SHRQ with a non-zero count sets ZF from its result, so the
	// early-exit branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x7_64_end

	// Matrix entries 0..22 stay resident in Z0-Z22; entries 23..34 are read
	// from memory inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header array address.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Data pointers of out[0..6]; R9 is loaded last for the same reason.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulGFNI_5x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (resident entries run out after Z22; the rest use .BCST memory operands)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x7_64_loop
	VZEROUPPER

mulGFNI_5x7_64_end:
	RET
// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*7 + output), XORs the 5 products
// together per output and overwrites the 7 outputs with the result. A tail
// of fewer than 32 bytes is left untouched.
//
// Register roles:
//   AX                 remaining 32-byte blocks
//   CX                 matrix base
//   BX, SI, DI, R8, DX cursors for inputs 0..4
//   R10..R15, R9       cursors for outputs 0..6
//   BP                 start offset (setup only)
//   Y0-Y6              matrix entries 0..6, broadcast across the register
//   Y7-Y13             accumulators for outputs 0..6
//   Y14                current input block
//   Y15                scratch (on-the-fly matrix entry, then product)
//
// NOTE(review): BP is used as a general-purpose register; the 8-byte frame
// is presumably what makes the Go assembler save and restore it - confirm
// against the generator before changing the frame size.
TEXT ·mulAvxGFNI_5x7(SB), $8-88
	// Loading 7 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ with a non-zero count sets ZF from its result, so the
	// early-exit branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x7_end

	// Only the entries for input 0 stay resident; the remaining 28 are
	// re-broadcast into Y15 inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header array address.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Data pointers of out[0..6]; R9 is loaded last for the same reason.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulAvxGFNI_5x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x7_loop
	VZEROUPPER

mulAvxGFNI_5x7_end:
	RET
// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*7 + output) and XORs the products
// into the data already stored in the 7 outputs. A tail of fewer than 64
// bytes is left untouched.
//
// Register roles:
//   AX                 remaining 64-byte blocks
//   CX                 matrix base
//   BX, SI, DI, R8, DX cursors for inputs 0..4
//   R10..R15, R9       cursors for outputs 0..6
//   BP                 start offset (setup only)
//   Z0-Z22             matrix entries 0..22, broadcast across the register
//   Z23-Z29            accumulators for outputs 0..6
//   Z30                current input block
//   Z31                scratch product
//
// NOTE(review): BP is used as a general-purpose register; the 8-byte frame
// is presumably what makes the Go assembler save and restore it - confirm
// against the generator before changing the frame size.
TEXT ·mulGFNI_5x7_64Xor(SB), $8-88
	// Loading 23 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64. SHRQ with a non-zero count sets ZF from its result, so the
	// early-exit branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_5x7_64Xor_end

	// Matrix entries 0..22 stay resident in Z0-Z22; entries 23..34 are read
	// from memory inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header array address.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Data pointers of out[0..6]; R9 is loaded last for the same reason.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulGFNI_5x7_64Xor_loop:
	// Load 7 outputs
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R9), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (resident entries run out after Z22; the rest use .BCST memory operands)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R9)
	ADDQ $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_5x7_64Xor_loop
	VZEROUPPER

mulGFNI_5x7_64Xor_end:
	RET
// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n>>5 whole 32-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*7 + output) and XORs the products
// into the data already stored in the 7 outputs. A tail of fewer than 32
// bytes is left untouched.
//
// Register roles:
//   AX                 remaining 32-byte blocks
//   CX                 matrix base
//   BX, SI, DI, R8, DX cursors for inputs 0..4
//   R10..R15, R9       cursors for outputs 0..6
//   BP                 start offset (setup only)
//   Y0-Y6              matrix entries 0..6, broadcast across the register
//   Y7-Y13             accumulators for outputs 0..6
//   Y14                current input block
//   Y15                scratch (on-the-fly matrix entry, then product)
//
// NOTE(review): BP is used as a general-purpose register; the 8-byte frame
// is presumably what makes the Go assembler save and restore it - confirm
// against the generator before changing the frame size.
TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88
	// Loading 7 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ with a non-zero count sets ZF from its result, so the
	// early-exit branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_5x7Xor_end

	// Only the entries for input 0 stay resident; the remaining 28 are
	// re-broadcast into Y15 inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// DX is loaded last because it also holds the header array address.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX

	// Data pointers of out[0..6]; R9 is loaded last for the same reason.
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulAvxGFNI_5x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x7Xor_loop
	VZEROUPPER

mulAvxGFNI_5x7Xor_end:
	RET
// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n>>6 whole 64-byte blocks beginning at byte offset start,
// transforms the 5 inputs with VGF2P8AFFINEQB (one 8-byte matrix entry per
// input/output pair, entry index = input*8 + output), XORs the 5 products
// together per output and overwrites the 8 outputs with the result. A tail
// of fewer than 64 bytes is left untouched.
//
// Register roles:
//   AX                 block count for the early exit, then cursor for input 4
//   CX                 matrix base
//   DX, BX, SI, DI, AX cursors for inputs 0..4
//   R9..R15, R8        cursors for outputs 0..7
//   BP                 start offset during setup, then remaining 64-byte blocks
//   Z0-Z21             matrix entries 0..21, broadcast across the register
//   Z22-Z29            accumulators for outputs 0..7
//   Z30                current input block
//   Z31                scratch product
//
// NOTE(review): BP is used as a general-purpose register; the 8-byte frame
// is presumably what makes the Go assembler save and restore it - confirm
// against the generator before changing the frame size.
TEXT ·mulGFNI_5x8_64(SB), $8-88
	// Loading 22 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64, used only for the early exit. SHRQ with a non-zero count
	// sets ZF from its result, so no separate TESTQ is needed.
	SHRQ $0x06, AX
	JZ mulGFNI_5x8_64_end

	// Matrix entries 0..21 stay resident in Z0-Z21; entries 22..39 are read
	// from memory inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Data pointers of in[0..4]; slice headers are 24 bytes apart.
	// AX is recycled here: it holds the header array address and finally the
	// input 4 pointer, which is why the block count is reloaded below.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX

	// Data pointers of out[0..7]; R8 is loaded last because it also holds the
	// header array address.
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_5x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (resident entries run out after Z21; the rest use .BCST memory operands)
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_5x8_64_loop
	VZEROUPPER

mulGFNI_5x8_64_end:
	RET
// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 8 outputs from 5 inputs, 32 bytes per loop iteration, overwriting
// the outputs. For every 32-byte block, out[j] receives the XOR over
// i = 0..4 of VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]) with a zero
// additive constant (imm8 = 0).
//
// Inputs and outputs are both accessed starting at byte offset start.
// n>>5 whole blocks are processed; when that count is zero the function
// returns immediately, and a trailing n%32 bytes are not handled here.
//
// Register use:
//   CX                  base of matrix (8 bytes per entry)
//   DX, BX, SI, DI, AX  read cursors for in[0]..in[4]
//   R9..R15, R8         write cursors for out[0]..out[7]
//   BP                  32-byte blocks remaining
//   Y0-Y5               matrix[0]..matrix[5], broadcast once before the loop
//   Y6-Y13              accumulators for out[0]..out[7]
//   Y14                 current input block
//   Y15                 scratch: broadcast matrix entry, then its product
//
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP, which is used as the loop counter below - confirm against the
// generator.
TEXT ·mulAvxGFNI_5x8(SB), $8-88
// Loading 6 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 50 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; bail out when there are none.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_5x8_end
// Broadcast matrix[0..5] to every 64-bit lane of Y0-Y5.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Fetch the data pointer of each input slice; slice headers are 24 bytes
// apart. AX is overwritten last because it holds the header array base.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), AX
// Fetch the data pointer of each output slice. R8 holds the header array
// base and is overwritten last.
// NOTE(review): the load of out_base into R8 is emitted twice in a row; the
// second copy is redundant but harmless.
MOVQ out_base+48(FP), R8
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R13
MOVQ 120(R8), R14
MOVQ 144(R8), R15
MOVQ 168(R8), R8
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R8
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, AX
// Reload length to save a register
// (AX was reused as an input cursor, so the block count is recomputed in BP.)
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_5x8_loop:
// Load and process 32 bytes from input 0 to 8 outputs
// Input 0 initialises the accumulators directly, so previous output
// contents are never read. Uses matrix[0..7]; entries 6 and 7 are not
// resident in registers and are broadcast into their destination first.
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
VBROADCASTSD 48(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 56(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 8 outputs
// Uses matrix[8..15]; each product is XORed into its accumulator.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
// Uses matrix[16..23].
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
// Uses matrix[24..31].
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
// Uses matrix[32..39].
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
// Each write cursor advances by the 32 bytes just written.
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_5x8_loop
// Zero the upper vector-register bits once the loop has run; the early-exit
// path jumps past this because no vector instruction has executed yet.
VZEROUPPER
mulAvxGFNI_5x8_end:
RET
// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating (XOR) form for 5 inputs and 8 outputs, 64 bytes per loop
// iteration. For every 64-byte block the existing contents of out[j] are
// loaded, XORed with VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]) for each
// i = 0..4 (zero additive constant, imm8 = 0), and written back.
//
// Inputs and outputs are both accessed starting at byte offset start.
// n>>6 whole blocks are processed; when that count is zero the function
// returns immediately, and a trailing n%64 bytes are not handled here.
//
// Register use:
//   CX                  base of matrix (8 bytes per entry)
//   DX, BX, SI, DI, AX  read cursors for in[0]..in[4]
//   R9..R15, R8         read/write cursors for out[0]..out[7]
//   BP                  64-byte blocks remaining
//   Z0-Z21              matrix[0]..matrix[21], broadcast once before the loop
//   Z22-Z29             accumulators for out[0]..out[7]
//   Z30                 current input block
//   Z31                 scratch product
//
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP, which is used as the loop counter below - confirm against the
// generator.
TEXT ·mulGFNI_5x8_64Xor(SB), $8-88
// Loading 22 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 50 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; bail out when there are none.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_5x8_64Xor_end
// Broadcast matrix[0..21] to every 64-bit lane of Z0-Z21. The remaining
// entries (matrix[22..39]) are broadcast from memory inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Fetch the data pointer of each input slice; slice headers are 24 bytes
// apart. AX is overwritten last because it holds the header array base.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), AX
// Fetch the data pointer of each output slice. R8 holds the header array
// base and is overwritten last.
// NOTE(review): the load of out_base into R8 is emitted twice in a row; the
// second copy is redundant but harmless.
MOVQ out_base+48(FP), R8
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R13
MOVQ 120(R8), R14
MOVQ 144(R8), R15
MOVQ 168(R8), R8
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R8
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, AX
// Reload length to save a register
// (AX was reused as an input cursor, so the block count is recomputed in BP.)
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_5x8_64Xor_loop:
// Load 8 outputs
// The current output contents seed the accumulators, which is what makes
// this the accumulating variant.
VMOVDQU64 (R9), Z22
VMOVDQU64 (R10), Z23
VMOVDQU64 (R11), Z24
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (R8), Z29
// Load and process 64 bytes from input 0 to 8 outputs
// Uses matrix[0..7], all resident in Z0-Z7.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 8 outputs
// Uses matrix[8..15], all resident in Z8-Z15.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
// Uses matrix[16..23]: the first six come from Z16-Z21, the last two are
// broadcast straight from memory via the .BCST operand form.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 8 outputs
// Uses matrix[24..31], broadcast from memory.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 8 outputs
// Uses matrix[32..39], broadcast from memory.
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
// Each cursor advances by the 64 bytes just written.
VMOVDQU64 Z22, (R9)
ADDQ $0x40, R9
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R8)
ADDQ $0x40, R8
// Prepare for next loop
DECQ BP
JNZ mulGFNI_5x8_64Xor_loop
// Zero the upper vector-register bits once the loop has run; the early-exit
// path jumps past this because no vector instruction has executed yet.
VZEROUPPER
mulGFNI_5x8_64Xor_end:
RET
// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating (XOR) form for 5 inputs and 8 outputs, 32 bytes per loop
// iteration. For every 32-byte block the existing contents of out[j] are
// loaded, XORed with VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]) for each
// i = 0..4 (zero additive constant, imm8 = 0), and written back.
//
// Inputs and outputs are both accessed starting at byte offset start.
// n>>5 whole blocks are processed; when that count is zero the function
// returns immediately, and a trailing n%32 bytes are not handled here.
//
// Register use:
//   CX                  base of matrix (8 bytes per entry)
//   DX, BX, SI, DI, AX  read cursors for in[0]..in[4]
//   R9..R15, R8         read/write cursors for out[0]..out[7]
//   BP                  32-byte blocks remaining
//   Y0-Y5               matrix[0]..matrix[5], broadcast once before the loop
//   Y6-Y13              accumulators for out[0]..out[7]
//   Y14                 current input block
//   Y15                 scratch: broadcast matrix entry and/or product
//
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// preserves BP, which is used as the loop counter below - confirm against the
// generator.
TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88
// Loading 6 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 50 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; bail out when there are none.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_5x8Xor_end
// Broadcast matrix[0..5] to every 64-bit lane of Y0-Y5.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Fetch the data pointer of each input slice; slice headers are 24 bytes
// apart. AX is overwritten last because it holds the header array base.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), AX
// Fetch the data pointer of each output slice. R8 holds the header array
// base and is overwritten last.
// NOTE(review): the load of out_base into R8 is emitted twice in a row; the
// second copy is redundant but harmless.
MOVQ out_base+48(FP), R8
MOVQ out_base+48(FP), R8
MOVQ (R8), R9
MOVQ 24(R8), R10
MOVQ 48(R8), R11
MOVQ 72(R8), R12
MOVQ 96(R8), R13
MOVQ 120(R8), R14
MOVQ 144(R8), R15
MOVQ 168(R8), R8
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R8
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, AX
// Reload length to save a register
// (AX was reused as an input cursor, so the block count is recomputed in BP.)
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_5x8Xor_loop:
// Load 8 outputs
// The current output contents seed the accumulators, which is what makes
// this the accumulating variant.
VMOVDQU (R9), Y6
VMOVDQU (R10), Y7
VMOVDQU (R11), Y8
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (R8), Y13
// Load and process 32 bytes from input 0 to 8 outputs
// Uses matrix[0..7]: the first six come from Y0-Y5, the last two are
// broadcast into Y15 on demand.
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 8 outputs
// Uses matrix[8..15].
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
// Uses matrix[16..23].
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
// Uses matrix[24..31].
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
// Uses matrix[32..39].
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
// Each cursor advances by the 32 bytes just written.
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R8)
ADDQ $0x20, R8
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_5x8Xor_loop
// Zero the upper vector-register bits once the loop has run; the early-exit
// path jumps past this because no vector instruction has executed yet.
VZEROUPPER
mulAvxGFNI_5x8Xor_end:
RET
// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 9 outputs from 5 inputs, 64 bytes per loop iteration, overwriting
// the outputs. For every 64-byte block, out[j] receives the XOR over
// i = 0..4 of VGF2P8AFFINEQB(block of in[i], matrix[i*9+j]) with a zero
// additive constant (imm8 = 0).
//
// Inputs and outputs are both accessed starting at byte offset start.
// n>>6 whole blocks are processed; when that count is zero the function
// returns immediately, and a trailing n%64 bytes are not handled here.
//
// Register use:
//   AX                  64-byte blocks remaining
//   CX                  base of matrix (8 bytes per entry)
//   BX, SI, DI, R8, DX  read cursors for in[0]..in[4]
//   R9                  base of the out slice-header array
//   R10                 running byte offset into every output (starts at start)
//   R11                 scratch: data pointer of the output being stored
//   Z0-Z20              matrix[0]..matrix[20], broadcast once before the loop
//   Z21-Z29             accumulators for out[0]..out[8]
//   Z30                 current input block
//   Z31                 scratch product
TEXT ·mulGFNI_5x9_64(SB), $0-88
// Loading 21 of 45 tables to registers
// Destination kept on stack
// (Output data pointers are not held in registers; they are re-read from the
// out slice headers through R9 on every iteration.)
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; bail out when there are none.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_5x9_64_end
// Broadcast matrix[0..20] to every 64-bit lane of Z0-Z20. The remaining
// entries (matrix[21..44]) are broadcast from memory inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Fetch the data pointer of each input slice; slice headers are 24 bytes
// apart. DX is overwritten last because it holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), DX
// NOTE(review): the load of out_base into R9 is emitted twice in a row; the
// second copy is redundant but harmless.
MOVQ out_base+48(FP), R9
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulGFNI_5x9_64_loop:
// Load and process 64 bytes from input 0 to 9 outputs
// Input 0 initialises the accumulators directly, so previous output
// contents are never read. Uses matrix[0..8] from Z0-Z8.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z29
// Load and process 64 bytes from input 1 to 9 outputs
// Uses matrix[9..17] from Z9-Z17; each product is XORed into its accumulator.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
// Uses matrix[18..26]: the first three come from Z18-Z20, the rest are
// broadcast straight from memory via the .BCST operand form.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 9 outputs
// Uses matrix[27..35], broadcast from memory.
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 9 outputs
// Uses matrix[36..44], broadcast from memory.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
// For each output: fetch its data pointer from the slice header (24-byte
// stride) into R11, then store at that pointer plus the running offset R10.
MOVQ (R9), R11
VMOVDQU64 Z21, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU64 Z22, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU64 Z23, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU64 Z24, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU64 Z25, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU64 Z26, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU64 Z27, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU64 Z28, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU64 Z29, (R11)(R10*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R10
DECQ AX
JNZ mulGFNI_5x9_64_loop
// Zero the upper vector-register bits once the loop has run; the early-exit
// path jumps past this because no vector instruction has executed yet.
VZEROUPPER
mulGFNI_5x9_64_end:
RET
// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 9 outputs from 5 inputs, 32 bytes per loop iteration, overwriting
// the outputs. For every 32-byte block, out[j] receives the XOR over
// i = 0..4 of VGF2P8AFFINEQB(block of in[i], matrix[i*9+j]) with a zero
// additive constant (imm8 = 0).
//
// Inputs and outputs are both accessed starting at byte offset start.
// n>>5 whole blocks are processed; when that count is zero the function
// returns immediately, and a trailing n%32 bytes are not handled here.
//
// Register use:
//   AX                  32-byte blocks remaining
//   CX                  base of matrix (8 bytes per entry)
//   BX, SI, DI, R8, DX  read cursors for in[0]..in[4]
//   R9                  base of the out slice-header array
//   R10                 running byte offset into every output (starts at start)
//   R11                 scratch: data pointer of the output being stored
//   Y0-Y4               matrix[0]..matrix[4], broadcast once before the loop
//   Y5-Y13              accumulators for out[0]..out[8]
//   Y14                 current input block
//   Y15                 scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_5x9(SB), $0-88
// Loading 5 of 45 tables to registers
// Destination kept on stack
// (Output data pointers are not held in registers; they are re-read from the
// out slice headers through R9 on every iteration.)
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; bail out when there are none.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_5x9_end
// Broadcast matrix[0..4] to every 64-bit lane of Y0-Y4.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Fetch the data pointer of each input slice; slice headers are 24 bytes
// apart. DX is overwritten last because it holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), DX
// NOTE(review): the load of out_base into R9 is emitted twice in a row; the
// second copy is redundant but harmless.
MOVQ out_base+48(FP), R9
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulAvxGFNI_5x9_loop:
// Load and process 32 bytes from input 0 to 9 outputs
// Input 0 initialises the accumulators directly, so previous output
// contents are never read. Uses matrix[0..8]; entries 5..8 are not resident
// in registers and are broadcast into their destination first.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
VBROADCASTSD 40(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 48(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 56(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 64(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 9 outputs
// Uses matrix[9..17]; each product is XORed into its accumulator.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
// Uses matrix[18..26].
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
// Uses matrix[27..35].
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 9 outputs
// Uses matrix[36..44].
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
// For each output: fetch its data pointer from the slice header (24-byte
// stride) into R11, then store at that pointer plus the running offset R10.
MOVQ (R9), R11
VMOVDQU Y5, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU Y6, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU Y7, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU Y8, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU Y9, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU Y10, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU Y11, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU Y12, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU Y13, (R11)(R10*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x20, R10
DECQ AX
JNZ mulAvxGFNI_5x9_loop
// Zero the upper vector-register bits once the loop has run; the early-exit
// path jumps past this because no vector instruction has executed yet.
VZEROUPPER
mulAvxGFNI_5x9_end:
RET
// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating (XOR) form for 5 inputs and 9 outputs, 64 bytes per loop
// iteration. For every 64-byte block the existing contents of out[j] are
// loaded, XORed with VGF2P8AFFINEQB(block of in[i], matrix[i*9+j]) for each
// i = 0..4 (zero additive constant, imm8 = 0), and written back.
//
// Inputs and outputs are both accessed starting at byte offset start.
// n>>6 whole blocks are processed; when that count is zero the function
// returns immediately, and a trailing n%64 bytes are not handled here.
//
// Register use:
//   AX                  64-byte blocks remaining
//   CX                  base of matrix (8 bytes per entry)
//   BX, SI, DI, R8, DX  read cursors for in[0]..in[4]
//   R9                  base of the out slice-header array
//   R10                 running byte offset into every output (starts at start)
//   R11                 scratch: data pointer of the output being accessed
//   Z0-Z20              matrix[0]..matrix[20], broadcast once before the loop
//   Z21-Z29             accumulators for out[0]..out[8]
//   Z30                 current input block
//   Z31                 scratch product
TEXT ·mulGFNI_5x9_64Xor(SB), $0-88
// Loading 21 of 45 tables to registers
// Destination kept on stack
// (Output data pointers are not held in registers; they are re-read from the
// out slice headers through R9 on every iteration.)
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; bail out when there are none.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_5x9_64Xor_end
// Broadcast matrix[0..20] to every 64-bit lane of Z0-Z20. The remaining
// entries (matrix[21..44]) are broadcast from memory inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Fetch the data pointer of each input slice; slice headers are 24 bytes
// apart. DX is overwritten last because it holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), DX
// NOTE(review): the load of out_base into R9 is emitted twice in a row; the
// second copy is redundant but harmless.
MOVQ out_base+48(FP), R9
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulGFNI_5x9_64Xor_loop:
// Load 9 outputs
// The current output contents seed the accumulators, which is what makes
// this the accumulating variant. Each data pointer is fetched from its
// slice header into R11 and indexed by the running offset R10.
MOVQ (R9), R11
VMOVDQU64 (R11)(R10*1), Z21
MOVQ 24(R9), R11
VMOVDQU64 (R11)(R10*1), Z22
MOVQ 48(R9), R11
VMOVDQU64 (R11)(R10*1), Z23
MOVQ 72(R9), R11
VMOVDQU64 (R11)(R10*1), Z24
MOVQ 96(R9), R11
VMOVDQU64 (R11)(R10*1), Z25
MOVQ 120(R9), R11
VMOVDQU64 (R11)(R10*1), Z26
MOVQ 144(R9), R11
VMOVDQU64 (R11)(R10*1), Z27
MOVQ 168(R9), R11
VMOVDQU64 (R11)(R10*1), Z28
MOVQ 192(R9), R11
VMOVDQU64 (R11)(R10*1), Z29
// Load and process 64 bytes from input 0 to 9 outputs
// Uses matrix[0..8] from Z0-Z8.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 9 outputs
// Uses matrix[9..17] from Z9-Z17.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
// Uses matrix[18..26]: the first three come from Z18-Z20, the rest are
// broadcast straight from memory via the .BCST operand form.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 9 outputs
// Uses matrix[27..35], broadcast from memory.
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 9 outputs
// Uses matrix[36..44], broadcast from memory.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
// Same addressing as the loads above: slice-header pointer in R11 plus the
// running offset R10.
MOVQ (R9), R11
VMOVDQU64 Z21, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU64 Z22, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU64 Z23, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU64 Z24, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU64 Z25, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU64 Z26, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU64 Z27, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU64 Z28, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU64 Z29, (R11)(R10*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R10
DECQ AX
JNZ mulGFNI_5x9_64Xor_loop
// Zero the upper vector-register bits once the loop has run; the early-exit
// path jumps past this because no vector instruction has executed yet.
VZEROUPPER
mulGFNI_5x9_64Xor_end:
RET
// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n/32 whole 32-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 5 input slices with the 8-byte matrix entries
// and XORs the results into the bytes already present in the 9 output slices.
// The entry at byte offset 8*(i*9+j) of matrix is used for input i, output j.
// The low 5 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                   remaining 32-byte blocks
//   CX                   matrix base
//   BX, SI, DI, R8, DX   read cursors for inputs 0-4 (start already added)
//   R9                   address of out's slice headers (24 bytes apart)
//   R10                  byte offset applied to every output, begins at start
//   R11                  scratch: data pointer of the output being accessed
//   Y0-Y4                matrix entries 0-4, broadcast once before the loop
//   Y5-Y13               accumulators for outputs 0-8
//   Y14 / Y15            current input block / scratch entry and product
TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88
// Loading 5 of 45 tables to registers
// Destination kept on stack
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_5x9Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
// DX held the header base; this last load replaces it with input 4's pointer.
MOVQ 96(DX), DX
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulAvxGFNI_5x9Xor_loop:
// Load 9 outputs
MOVQ (R9), R11
VMOVDQU (R11)(R10*1), Y5
MOVQ 24(R9), R11
VMOVDQU (R11)(R10*1), Y6
MOVQ 48(R9), R11
VMOVDQU (R11)(R10*1), Y7
MOVQ 72(R9), R11
VMOVDQU (R11)(R10*1), Y8
MOVQ 96(R9), R11
VMOVDQU (R11)(R10*1), Y9
MOVQ 120(R9), R11
VMOVDQU (R11)(R10*1), Y10
MOVQ 144(R9), R11
VMOVDQU (R11)(R10*1), Y11
MOVQ 168(R9), R11
VMOVDQU (R11)(R10*1), Y12
MOVQ 192(R9), R11
VMOVDQU (R11)(R10*1), Y13
// Load and process 32 bytes from input 0 to 9 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
// Entries from 40(CX) onward are not register-resident; broadcast on demand.
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
MOVQ (R9), R11
VMOVDQU Y5, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU Y6, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU Y7, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU Y8, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU Y9, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU Y10, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU Y11, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU Y12, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU Y13, (R11)(R10*1)
// Prepare for next loop
ADDQ $0x20, R10
DECQ AX
JNZ mulAvxGFNI_5x9Xor_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulAvxGFNI_5x9Xor_end:
RET
// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n/64 whole 64-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 5 input slices with the 8-byte matrix entries,
// XORs the five results per output together and stores them to the 10 output
// slices, overwriting what was there (outputs are never read).
// The entry at byte offset 8*(i*10+j) of matrix is used for input i, output j.
// The low 6 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                   remaining 64-byte blocks
//   CX                   matrix base
//   BX, SI, DI, R8, DX   read cursors for inputs 0-4 (start already added)
//   R9                   address of out's slice headers (24 bytes apart)
//   R10                  byte offset applied to every output, begins at start
//   R11                  scratch: data pointer of the output being stored
//   Z0-Z19               matrix entries 0-19 (inputs 0 and 1), broadcast once
//   Z20-Z29              accumulators for outputs 0-9
//   Z30 / Z31            current input block / scratch product
TEXT ·mulGFNI_5x10_64(SB), $0-88
// Loading 20 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_5x10_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
// DX held the header base; this last load replaces it with input 4's pointer.
MOVQ 96(DX), DX
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulGFNI_5x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// Input 0 initialises the accumulators directly, so no XOR is needed.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// Entries from 160(CX) onward are read from memory via the .BCST form.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R9), R11
VMOVDQU64 Z20, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU64 Z21, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU64 Z22, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU64 Z23, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU64 Z24, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU64 Z25, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU64 Z26, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU64 Z27, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU64 Z28, (R11)(R10*1)
MOVQ 216(R9), R11
VMOVDQU64 Z29, (R11)(R10*1)
// Prepare for next loop
ADDQ $0x40, R10
DECQ AX
JNZ mulGFNI_5x10_64_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulGFNI_5x10_64_end:
RET
// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n/32 whole 32-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 5 input slices with the 8-byte matrix entries,
// XORs the five results per output together and stores them to the 10 output
// slices, overwriting what was there (outputs are never read).
// The entry at byte offset 8*(i*10+j) of matrix is used for input i, output j.
// The low 5 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                   remaining 32-byte blocks
//   CX                   matrix base
//   BX, SI, DI, R8, DX   read cursors for inputs 0-4 (start already added)
//   R9                   address of out's slice headers (24 bytes apart)
//   R10                  byte offset applied to every output, begins at start
//   R11                  scratch: data pointer of the output being stored
//   Y0-Y3                matrix entries 0-3, broadcast once before the loop
//   Y4-Y13               accumulators for outputs 0-9
//   Y14 / Y15            current input block / scratch entry and product
TEXT ·mulAvxGFNI_5x10(SB), $0-88
// Loading 4 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_5x10_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
// DX held the header base; this last load replaces it with input 4's pointer.
MOVQ 96(DX), DX
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulAvxGFNI_5x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// Input 0 initialises the accumulators directly, so no XOR is needed.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
// Remaining entries are broadcast into the accumulator, then transformed in place.
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R9), R11
VMOVDQU Y4, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU Y5, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU Y6, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU Y7, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU Y8, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU Y9, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU Y10, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU Y11, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU Y12, (R11)(R10*1)
MOVQ 216(R9), R11
VMOVDQU Y13, (R11)(R10*1)
// Prepare for next loop
ADDQ $0x20, R10
DECQ AX
JNZ mulAvxGFNI_5x10_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulAvxGFNI_5x10_end:
RET
// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n/64 whole 64-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 5 input slices with the 8-byte matrix entries
// and XORs the results into the bytes already present in the 10 output slices.
// The entry at byte offset 8*(i*10+j) of matrix is used for input i, output j.
// The low 6 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                   remaining 64-byte blocks
//   CX                   matrix base
//   BX, SI, DI, R8, DX   read cursors for inputs 0-4 (start already added)
//   R9                   address of out's slice headers (24 bytes apart)
//   R10                  byte offset applied to every output, begins at start
//   R11                  scratch: data pointer of the output being accessed
//   Z0-Z19               matrix entries 0-19 (inputs 0 and 1), broadcast once
//   Z20-Z29              accumulators for outputs 0-9
//   Z30 / Z31            current input block / scratch product
TEXT ·mulGFNI_5x10_64Xor(SB), $0-88
// Loading 20 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_5x10_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
// DX held the header base; this last load replaces it with input 4's pointer.
MOVQ 96(DX), DX
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulGFNI_5x10_64Xor_loop:
// Load 10 outputs
MOVQ (R9), R11
VMOVDQU64 (R11)(R10*1), Z20
MOVQ 24(R9), R11
VMOVDQU64 (R11)(R10*1), Z21
MOVQ 48(R9), R11
VMOVDQU64 (R11)(R10*1), Z22
MOVQ 72(R9), R11
VMOVDQU64 (R11)(R10*1), Z23
MOVQ 96(R9), R11
VMOVDQU64 (R11)(R10*1), Z24
MOVQ 120(R9), R11
VMOVDQU64 (R11)(R10*1), Z25
MOVQ 144(R9), R11
VMOVDQU64 (R11)(R10*1), Z26
MOVQ 168(R9), R11
VMOVDQU64 (R11)(R10*1), Z27
MOVQ 192(R9), R11
VMOVDQU64 (R11)(R10*1), Z28
MOVQ 216(R9), R11
VMOVDQU64 (R11)(R10*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// Entries from 160(CX) onward are read from memory via the .BCST form.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R9), R11
VMOVDQU64 Z20, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU64 Z21, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU64 Z22, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU64 Z23, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU64 Z24, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU64 Z25, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU64 Z26, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU64 Z27, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU64 Z28, (R11)(R10*1)
MOVQ 216(R9), R11
VMOVDQU64 Z29, (R11)(R10*1)
// Prepare for next loop
ADDQ $0x40, R10
DECQ AX
JNZ mulGFNI_5x10_64Xor_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulGFNI_5x10_64Xor_end:
RET
// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n/32 whole 32-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 5 input slices with the 8-byte matrix entries
// and XORs the results into the bytes already present in the 10 output slices.
// The entry at byte offset 8*(i*10+j) of matrix is used for input i, output j.
// The low 5 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                   remaining 32-byte blocks
//   CX                   matrix base
//   BX, SI, DI, R8, DX   read cursors for inputs 0-4 (start already added)
//   R9                   address of out's slice headers (24 bytes apart)
//   R10                  byte offset applied to every output, begins at start
//   R11                  scratch: data pointer of the output being accessed
//   Y0-Y3                matrix entries 0-3, broadcast once before the loop
//   Y4-Y13               accumulators for outputs 0-9
//   Y14 / Y15            current input block / scratch entry and product
TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88
// Loading 4 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_5x10Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
// DX held the header base; this last load replaces it with input 4's pointer.
MOVQ 96(DX), DX
MOVQ out_base+48(FP), R9
MOVQ start+72(FP), R10
// Add start offset to input
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, DX
mulAvxGFNI_5x10Xor_loop:
// Load 10 outputs
MOVQ (R9), R11
VMOVDQU (R11)(R10*1), Y4
MOVQ 24(R9), R11
VMOVDQU (R11)(R10*1), Y5
MOVQ 48(R9), R11
VMOVDQU (R11)(R10*1), Y6
MOVQ 72(R9), R11
VMOVDQU (R11)(R10*1), Y7
MOVQ 96(R9), R11
VMOVDQU (R11)(R10*1), Y8
MOVQ 120(R9), R11
VMOVDQU (R11)(R10*1), Y9
MOVQ 144(R9), R11
VMOVDQU (R11)(R10*1), Y10
MOVQ 168(R9), R11
VMOVDQU (R11)(R10*1), Y11
MOVQ 192(R9), R11
VMOVDQU (R11)(R10*1), Y12
MOVQ 216(R9), R11
VMOVDQU (R11)(R10*1), Y13
// Load and process 32 bytes from input 0 to 10 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y4, Y15, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y7, Y15, Y7
// Entries from 32(CX) onward are not register-resident; broadcast on demand.
VBROADCASTSD 32(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R9), R11
VMOVDQU Y4, (R11)(R10*1)
MOVQ 24(R9), R11
VMOVDQU Y5, (R11)(R10*1)
MOVQ 48(R9), R11
VMOVDQU Y6, (R11)(R10*1)
MOVQ 72(R9), R11
VMOVDQU Y7, (R11)(R10*1)
MOVQ 96(R9), R11
VMOVDQU Y8, (R11)(R10*1)
MOVQ 120(R9), R11
VMOVDQU Y9, (R11)(R10*1)
MOVQ 144(R9), R11
VMOVDQU Y10, (R11)(R10*1)
MOVQ 168(R9), R11
VMOVDQU Y11, (R11)(R10*1)
MOVQ 192(R9), R11
VMOVDQU Y12, (R11)(R10*1)
MOVQ 216(R9), R11
VMOVDQU Y13, (R11)(R10*1)
// Prepare for next loop
ADDQ $0x20, R10
DECQ AX
JNZ mulAvxGFNI_5x10Xor_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulAvxGFNI_5x10Xor_end:
RET
// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n/64 whole 64-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 6 input slices with matrix entries 0-5, XORs
// the six results together and stores them to out[0], overwriting what was
// there (the output is never read).
// The low 6 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                       remaining 64-byte blocks
//   CX                       matrix base, then reused as input 5's cursor
//   DX, BX, SI, DI, R8, CX   read cursors for inputs 0-5 (start already added)
//   R9                       write cursor into out[0] (start already added)
//   R10                      start
//   Z0-Z5                    matrix entries 0-5, broadcast once
//   Z6                       accumulator for the single output
//   Z7                       current input block, then its product
TEXT ·mulGFNI_6x1_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 9 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_6x1_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
// All entries are in registers now, so CX is free to walk the in headers.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), CX
MOVQ out_base+48(FP), R9
MOVQ (R9), R9
MOVQ start+72(FP), R10
// Add start offset to output
ADDQ R10, R9
// Add start offset to input
ADDQ R10, DX
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, CX
mulGFNI_6x1_64_loop:
// Load and process 64 bytes from input 0 to 1 outputs
// Input 0 initialises the accumulator directly, so no XOR is needed.
VMOVDQU64 (DX), Z7
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z7, Z6
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z7
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z7
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z7
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z7
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (CX), Z7
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
VXORPD Z6, Z7, Z6
// Store 1 outputs
VMOVDQU64 Z6, (R9)
ADDQ $0x40, R9
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x1_64_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulGFNI_6x1_64_end:
RET
// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of the n/32 whole 32-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 6 input slices with matrix entries 0-5, XORs
// the six results together and stores them to out[0], overwriting what was
// there (the output is never read).
// The low 5 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                       remaining 32-byte blocks
//   CX                       matrix base, then reused as input 5's cursor
//   DX, BX, SI, DI, R8, CX   read cursors for inputs 0-5 (start already added)
//   R9                       write cursor into out[0] (start already added)
//   R10                      start
//   Y0-Y5                    matrix entries 0-5, broadcast once
//   Y6                       accumulator for the single output
//   Y7                       current input block, then its product
TEXT ·mulAvxGFNI_6x1(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 9 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x1_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// All entries are in registers now, so CX is free to walk the in headers.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), CX
MOVQ out_base+48(FP), R9
MOVQ (R9), R9
MOVQ start+72(FP), R10
// Add start offset to output
ADDQ R10, R9
// Add start offset to input
ADDQ R10, DX
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, CX
mulAvxGFNI_6x1_loop:
// Load and process 32 bytes from input 0 to 1 outputs
// Input 0 initialises the accumulator directly, so no XOR is needed.
VMOVDQU (DX), Y7
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y7, Y6
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y7
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
VXORPD Y6, Y7, Y6
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y7
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
VXORPD Y6, Y7, Y6
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y7
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
VXORPD Y6, Y7, Y6
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y7
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
VXORPD Y6, Y7, Y6
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (CX), Y7
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
VXORPD Y6, Y7, Y6
// Store 1 outputs
VMOVDQU Y6, (R9)
ADDQ $0x20, R9
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_6x1_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulAvxGFNI_6x1_end:
RET
// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of the n/64 whole 64-byte blocks beginning at byte offset start,
// runs VGF2P8AFFINEQB over the 6 input slices with matrix entries 0-5 and
// XORs the results into the bytes already present in out[0].
// The low 6 bits of n are discarded by the shift; that tail is not processed.
//
// Register roles:
//   AX                       remaining 64-byte blocks
//   CX                       matrix base, then reused as input 5's cursor
//   DX, BX, SI, DI, R8, CX   read cursors for inputs 0-5 (start already added)
//   R9                       read/write cursor into out[0] (start already added)
//   R10                      start
//   Z0-Z5                    matrix entries 0-5, broadcast once
//   Z6                       accumulator for the single output
//   Z7                       current input block, then its product
TEXT ·mulGFNI_6x1_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 9 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_6x1_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
// All entries are in registers now, so CX is free to walk the in headers.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), CX
MOVQ out_base+48(FP), R9
MOVQ (R9), R9
MOVQ start+72(FP), R10
// Add start offset to output
ADDQ R10, R9
// Add start offset to input
ADDQ R10, DX
ADDQ R10, BX
ADDQ R10, SI
ADDQ R10, DI
ADDQ R10, R8
ADDQ R10, CX
mulGFNI_6x1_64Xor_loop:
// Load 1 outputs
VMOVDQU64 (R9), Z6
// Load and process 64 bytes from input 0 to 1 outputs
VMOVDQU64 (DX), Z7
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z7
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z7
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z7
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z7
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z7, Z7
VXORPD Z6, Z7, Z6
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (CX), Z7
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z5, Z7, Z7
VXORPD Z6, Z7, Z6
// Store 1 outputs
VMOVDQU64 Z6, (R9)
ADDQ $0x40, R9
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x1_64Xor_loop
VZEROUPPER
// The early exit lands here, past VZEROUPPER.
mulGFNI_6x1_64Xor_end:
RET
// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating 6-input / 1-output kernel, 32 bytes per loop iteration.
// Per block: out[0] ^= XOR over i=0..5 of affine(matrix[i], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and matrix[i] broadcast to all lanes.
// Only n>>5 whole blocks are processed; a tail of n&31 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 9 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x1Xor_end

	// Y0..Y5 = matrix[0..5], each 64-bit table broadcast across the register.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R9 = data pointer of out[0].
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R9
	MOVQ start+72(FP), R10

	// Add start offset to output
	ADDQ R10, R9

	// Add start offset to input
	ADDQ R10, DX
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, CX

mulAvxGFNI_6x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R9), Y6

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU        (DX), Y7
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y7, Y7
	VXORPD         Y6, Y7, Y6

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU        (BX), Y7
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y7, Y7
	VXORPD         Y6, Y7, Y6

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU        (SI), Y7
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y7, Y7
	VXORPD         Y6, Y7, Y6

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU        (DI), Y7
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y7, Y7
	VXORPD         Y6, Y7, Y6

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU        (R8), Y7
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y7, Y7
	VXORPD         Y6, Y7, Y6

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU        (CX), Y7
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y5, Y7, Y7
	VXORPD         Y6, Y7, Y6

	// Store 1 outputs
	VMOVDQU Y6, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x1Xor_loop
	VZEROUPPER

mulAvxGFNI_6x1Xor_end:
	RET
// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting 6-input / 2-output kernel, 64 bytes per loop iteration.
// Per block: out[o] = XOR over i=0..5 of affine(matrix[i*2+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Previous output contents are not read. Only n>>6 whole blocks are
// processed; a tail of n&63 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulGFNI_6x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x2_64_end

	// Z0..Z11 = matrix[0..11], each 64-bit table broadcast across the register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10 = out[0] data, R9 = out[1] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulGFNI_6x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// Input 0 writes the accumulators Z12/Z13 directly (no prior value).
	VMOVDQU64      (DX), Z14
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z14, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z14, Z13

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z14
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z14
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z14
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (R8), Z14
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64      (CX), Z14
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Store 2 outputs
	VMOVDQU64 Z12, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z13, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x2_64_loop
	VZEROUPPER

mulGFNI_6x2_64_end:
	RET
// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting 6-input / 2-output kernel, 32 bytes per loop iteration.
// Per block: out[o] = XOR over i=0..5 of affine(matrix[i*2+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Previous output contents are not read. Only n>>5 whole blocks are
// processed; a tail of n&31 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulAvxGFNI_6x2(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x2_end

	// Y0..Y11 = matrix[0..11], each 64-bit table broadcast across the register.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10 = out[0] data, R9 = out[1] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulAvxGFNI_6x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// Input 0 writes the accumulators Y12/Y13 directly (no prior value).
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU        (CX), Y14
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x2_loop
	VZEROUPPER

mulAvxGFNI_6x2_end:
	RET
// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating 6-input / 2-output kernel, 64 bytes per loop iteration.
// Per block: out[o] ^= XOR over i=0..5 of affine(matrix[i*2+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Only n>>6 whole blocks are processed; a tail of n&63 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulGFNI_6x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x2_64Xor_end

	// Z0..Z11 = matrix[0..11], each 64-bit table broadcast across the register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10 = out[0] data, R9 = out[1] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulGFNI_6x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R10), Z12
	VMOVDQU64 (R9), Z13

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64      (DX), Z14
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64      (BX), Z14
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64      (SI), Z14
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64      (DI), Z14
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z7, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64      (R8), Z14
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z9, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64      (CX), Z14
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z10, Z14, Z15
	VXORPD         Z12, Z15, Z12
	VGF2P8AFFINEQB $0x00, Z11, Z14, Z15
	VXORPD         Z13, Z15, Z13

	// Store 2 outputs
	VMOVDQU64 Z12, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z13, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x2_64Xor_loop
	VZEROUPPER

mulGFNI_6x2_64Xor_end:
	RET
// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating 6-input / 2-output kernel, 32 bytes per loop iteration.
// Per block: out[o] ^= XOR over i=0..5 of affine(matrix[i*2+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Only n>>5 whole blocks are processed; a tail of n&31 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 16 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x2Xor_end

	// Y0..Y11 = matrix[0..11], each 64-bit table broadcast across the register.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10 = out[0] data, R9 = out[1] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, CX

mulAvxGFNI_6x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R10), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU        (CX), Y14
	ADDQ           $0x20, CX
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y13, (R9)
	ADDQ    $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x2Xor_loop
	VZEROUPPER

mulAvxGFNI_6x2Xor_end:
	RET
// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting 6-input / 3-output kernel, 64 bytes per loop iteration.
// Per block: out[o] = XOR over i=0..5 of affine(matrix[i*3+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Previous output contents are not read. Only n>>6 whole blocks are
// processed; a tail of n&63 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulGFNI_6x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 23 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x3_64_end

	// Z0..Z17 = matrix[0..17], each 64-bit table broadcast across the register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10/R11/R9 = out[0]/out[1]/out[2] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, CX

mulGFNI_6x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// Input 0 writes the accumulators Z18..Z20 directly (no prior value).
	VMOVDQU64      (DX), Z21
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z21, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z21, Z20

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z21
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z21
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (DI), Z21
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64      (R8), Z21
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64      (CX), Z21
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Store 3 outputs
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z20, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x3_64_loop
	VZEROUPPER

mulGFNI_6x3_64_end:
	RET
// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting 6-input / 3-output kernel, 32 bytes per loop iteration.
// Per block: out[o] = XOR over i=0..5 of affine(matrix[i*3+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Previous output contents are not read. Only n>>5 whole blocks are
// processed; a tail of n&31 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulAvxGFNI_6x3(SB), $0-88
	// Loading 11 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 23 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x3_end

	// Y0..Y10 = matrix[0..10]. matrix[11..17] are re-broadcast from memory
	// through Y15 inside the loop, so CX keeps pointing at the matrix.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// DX first holds the in base and ends up as in[5].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// R11/R12/R10 = out[0]/out[1]/out[2] data (R10 first holds the out base).
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, DX

mulAvxGFNI_6x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// Input 0 writes the accumulators Y11..Y13 directly (no prior value).
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (R10)
	ADDQ    $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x3_loop
	VZEROUPPER

mulAvxGFNI_6x3_end:
	RET
// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating 6-input / 3-output kernel, 64 bytes per loop iteration.
// Per block: out[o] ^= XOR over i=0..5 of affine(matrix[i*3+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Only n>>6 whole blocks are processed; a tail of n&63 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulGFNI_6x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 23 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x3_64Xor_end

	// Z0..Z17 = matrix[0..17], each 64-bit table broadcast across the register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10/R11/R9 = out[0]/out[1]/out[2] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, CX

mulGFNI_6x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R10), Z18
	VMOVDQU64 (R11), Z19
	VMOVDQU64 (R9), Z20

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64      (DX), Z21
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z2, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64      (BX), Z21
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z4, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z5, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64      (SI), Z21
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z8, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64      (DI), Z21
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z10, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z11, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64      (R8), Z21
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z14, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64      (CX), Z21
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z15, Z21, Z22
	VXORPD         Z18, Z22, Z18
	VGF2P8AFFINEQB $0x00, Z16, Z21, Z22
	VXORPD         Z19, Z22, Z19
	VGF2P8AFFINEQB $0x00, Z17, Z21, Z22
	VXORPD         Z20, Z22, Z20

	// Store 3 outputs
	VMOVDQU64 Z18, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z19, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z20, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x3_64Xor_loop
	VZEROUPPER

mulGFNI_6x3_64Xor_end:
	RET
// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating 6-input / 3-output kernel, 32 bytes per loop iteration.
// Per block: out[o] ^= XOR over i=0..5 of affine(matrix[i*3+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Only n>>5 whole blocks are processed; a tail of n&31 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88
	// Loading 11 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 23 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x3Xor_end

	// Y0..Y10 = matrix[0..10]. matrix[11..17] are re-broadcast from memory
	// through Y15 inside the loop, so CX keeps pointing at the matrix.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// DX first holds the in base and ends up as in[5].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// R11/R12/R10 = out[0]/out[1]/out[2] data (R10 first holds the out base).
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, DX

mulAvxGFNI_6x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R11), Y11
	VMOVDQU (R12), Y12
	VMOVDQU (R10), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y12, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y13, (R10)
	ADDQ    $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x3Xor_loop
	VZEROUPPER

mulAvxGFNI_6x3Xor_end:
	RET
// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting 6-input / 4-output kernel, 64 bytes per loop iteration.
// Per block: out[o] = XOR over i=0..5 of affine(matrix[i*4+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Previous output contents are not read. Only n>>6 whole blocks are
// processed; a tail of n&63 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulGFNI_6x4_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 64-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_6x4_64_end

	// Z0..Z23 = matrix[0..23], each 64-bit table broadcast across the register.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// CX is reused for in[5] once the tables are loaded.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), CX

	// R10/R11/R12/R9 = out[0..3] data (R9 first holds the out base).
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, CX

mulGFNI_6x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// Input 0 writes the accumulators Z24..Z27 directly (no prior value).
	VMOVDQU64      (DX), Z28
	ADDQ           $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z28, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z28, Z27

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64      (BX), Z28
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
	VXORPD         Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
	VXORPD         Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
	VXORPD         Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
	VXORPD         Z27, Z29, Z27

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64      (SI), Z28
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
	VXORPD         Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
	VXORPD         Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
	VXORPD         Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
	VXORPD         Z27, Z29, Z27

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64      (DI), Z28
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
	VXORPD         Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
	VXORPD         Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
	VXORPD         Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
	VXORPD         Z27, Z29, Z27

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64      (R8), Z28
	ADDQ           $0x40, R8
	VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
	VXORPD         Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
	VXORPD         Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
	VXORPD         Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
	VXORPD         Z27, Z29, Z27

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64      (CX), Z28
	ADDQ           $0x40, CX
	VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
	VXORPD         Z24, Z29, Z24
	VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
	VXORPD         Z25, Z29, Z25
	VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
	VXORPD         Z26, Z29, Z26
	VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
	VXORPD         Z27, Z29, Z27

	// Store 4 outputs
	VMOVDQU64 Z24, (R10)
	ADDQ      $0x40, R10
	VMOVDQU64 Z25, (R11)
	ADDQ      $0x40, R11
	VMOVDQU64 Z26, (R12)
	ADDQ      $0x40, R12
	VMOVDQU64 Z27, (R9)
	ADDQ      $0x40, R9

	// Prepare for next loop
	DECQ AX
	JNZ  mulGFNI_6x4_64_loop
	VZEROUPPER

mulGFNI_6x4_64_end:
	RET
// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting 6-input / 4-output kernel, 32 bytes per loop iteration.
// Per block: out[o] = XOR over i=0..5 of affine(matrix[i*4+o], in[i]), where
// affine is VGF2P8AFFINEQB with imm8 = 0 and the table broadcast to all lanes.
// Previous output contents are not read. Only n>>5 whole blocks are
// processed; a tail of n&31 bytes is not touched.
// start is added as a byte offset to every input and output pointer.
// No slice-length checks are made here; presumably the caller guarantees that
// start+n is in range for every slice (verify against caller).
TEXT ·mulAvxGFNI_6x4(SB), $0-88
	// Loading 10 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 30 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of 32-byte blocks. SHRQ sets ZF from its result, so the
	// branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x4_end

	// Y0..Y9 = matrix[0..9]. matrix[10..23] are re-broadcast from memory
	// through Y15 inside the loop, so CX keeps pointing at the matrix.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9

	// Data pointers of in[0..5]; slice headers are 24 bytes apart.
	// DX first holds the in base and ends up as in[5].
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX

	// R11/R12/R13/R10 = out[0..3] data (R10 first holds the out base).
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R10
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R10

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, DX

mulAvxGFNI_6x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// Input 0 writes the accumulators Y10..Y13 directly (no prior value).
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y11, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y13, (R10)
	ADDQ    $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_6x4_loop
	VZEROUPPER

mulAvxGFNI_6x4_end:
	RET
// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start in every
// slice. For each block, each of the 4 outputs is loaded, XORed with the
// VGF2P8AFFINEQB (constant 0) transform of the matching block of each of the
// 6 inputs, and stored back. The 8-byte matrix entry at index input*4+output
// is the affine matrix for that input/output pair. Returns immediately when
// n>>6 is zero; bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base, then in[5] data pointer
//   DX BX SI DI R8 CX   in[0..5] data pointers
//   R10 R11 R12 R9      out[0..3] data pointers
//   R13           start offset
//   Z0-Z23        broadcast matrix entries 0..23
//   Z24-Z27       output accumulators, Z28 input block, Z29 scratch
TEXT ·mulGFNI_6x4_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 30 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x06, AX
JZ mulGFNI_6x4_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header. The matrix base in CX is no longer needed.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), CX
MOVQ out_base+48(FP), R9
MOVQ (R9), R10
MOVQ 24(R9), R11
MOVQ 48(R9), R12
MOVQ 72(R9), R9
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, R9
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, CX
mulGFNI_6x4_64Xor_loop:
// Load 4 outputs
VMOVDQU64 (R10), Z24
VMOVDQU64 (R11), Z25
VMOVDQU64 (R12), Z26
VMOVDQU64 (R9), Z27
// Load and process 64 bytes from input 0 to 4 outputs
VMOVDQU64 (DX), Z28
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z1, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z2, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z3, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 1 to 4 outputs
VMOVDQU64 (BX), Z28
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z4, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z5, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z6, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z7, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 2 to 4 outputs
VMOVDQU64 (SI), Z28
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z9, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z10, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z11, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 3 to 4 outputs
VMOVDQU64 (DI), Z28
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z13, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z14, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z15, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 4 to 4 outputs
VMOVDQU64 (R8), Z28
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z16, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z17, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z18, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z19, Z28, Z29
VXORPD Z27, Z29, Z27
// Load and process 64 bytes from input 5 to 4 outputs
VMOVDQU64 (CX), Z28
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z20, Z28, Z29
VXORPD Z24, Z29, Z24
VGF2P8AFFINEQB $0x00, Z21, Z28, Z29
VXORPD Z25, Z29, Z25
VGF2P8AFFINEQB $0x00, Z22, Z28, Z29
VXORPD Z26, Z29, Z26
VGF2P8AFFINEQB $0x00, Z23, Z28, Z29
VXORPD Z27, Z29, Z27
// Store 4 outputs
VMOVDQU64 Z24, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R9)
ADDQ $0x40, R9
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x4_64Xor_loop
VZEROUPPER
mulGFNI_6x4_64Xor_end:
RET
// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start in every
// slice. For each block, each of the 4 outputs is loaded, XORed with the
// VGF2P8AFFINEQB (constant 0) transform of the matching block of each of the
// 6 inputs, and stored back. The 8-byte matrix entry at index input*4+output
// is the affine matrix for that input/output pair. Returns immediately when
// n>>5 is zero; bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 10..23 are re-read per block)
//   BX SI DI R8 R9 DX   in[0..5] data pointers
//   R11 R12 R13 R10     out[0..3] data pointers
//   R14           start offset
//   Y0-Y9         broadcast matrix entries 0..9
//   Y10-Y13       output accumulators, Y14 input block, Y15 scratch / entry
TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88
// Loading 10 of 24 tables to registers
// Destination kept in GP registers
// Full registers estimated 30 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x4Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R10
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, R10
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, DX
mulAvxGFNI_6x4Xor_loop:
// Load 4 outputs
VMOVDQU (R11), Y10
VMOVDQU (R12), Y11
VMOVDQU (R13), Y12
VMOVDQU (R10), Y13
// Load and process 32 bytes from input 0 to 4 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 4 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 4 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
// Matrix entries from here on do not fit in registers; each is broadcast
// from memory into Y15 immediately before use.
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 4 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 4 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 4 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 4 outputs
VMOVDQU Y10, (R11)
ADDQ $0x20, R11
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_6x4Xor_loop
VZEROUPPER
mulAvxGFNI_6x4Xor_end:
RET
// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start in every
// slice. For each block, each of the 5 outputs is overwritten with the XOR of
// the VGF2P8AFFINEQB (constant 0) transforms of the matching block of each of
// the 6 inputs. The 8-byte matrix entry at index input*5+output is the affine
// matrix for that input/output pair. Returns immediately when n>>6 is zero;
// bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 25..29 are read per block)
//   BX SI DI R8 R9 DX       in[0..5] data pointers
//   R11 R12 R13 R14 R10     out[0..4] data pointers
//   R15           start offset
//   Z0-Z24        broadcast matrix entries 0..24
//   Z25-Z29       output accumulators, Z30 input block, Z31 scratch
TEXT ·mulGFNI_6x5_64(SB), $0-88
// Loading 25 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 37 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x06, AX
JZ mulGFNI_6x5_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R10
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, R10
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, DX
mulGFNI_6x5_64_loop:
// Load and process 64 bytes from input 0 to 5 outputs
// Input 0 initialises the accumulators directly, so prior output is discarded.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// Entries 25..29 were not preloaded; .BCST broadcasts each from memory.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x5_64_loop
VZEROUPPER
mulGFNI_6x5_64_end:
RET
// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start in every
// slice. For each block, each of the 5 outputs is overwritten with the XOR of
// the VGF2P8AFFINEQB (constant 0) transforms of the matching block of each of
// the 6 inputs. The 8-byte matrix entry at index input*5+output is the affine
// matrix for that input/output pair. Returns immediately when n>>5 is zero;
// bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 9..29 are re-read per block)
//   BX SI DI R8 R9 DX       in[0..5] data pointers
//   R11 R12 R13 R14 R10     out[0..4] data pointers
//   R15           start offset
//   Y0-Y8         broadcast matrix entries 0..8
//   Y9-Y13        output accumulators, Y14 input block, Y15 scratch / entry
TEXT ·mulAvxGFNI_6x5(SB), $0-88
// Loading 9 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 37 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x5_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R10
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, R10
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, DX
mulAvxGFNI_6x5_loop:
// Load and process 32 bytes from input 0 to 5 outputs
// Input 0 initialises the accumulators directly, so prior output is discarded.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
// Load and process 32 bytes from input 1 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// Matrix entries from here on do not fit in registers; each is broadcast
// from memory into Y15 immediately before use.
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
VMOVDQU Y9, (R11)
ADDQ $0x20, R11
VMOVDQU Y10, (R12)
ADDQ $0x20, R12
VMOVDQU Y11, (R13)
ADDQ $0x20, R13
VMOVDQU Y12, (R14)
ADDQ $0x20, R14
VMOVDQU Y13, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_6x5_loop
VZEROUPPER
mulAvxGFNI_6x5_end:
RET
// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start in every
// slice. For each block, each of the 5 outputs is loaded, XORed with the
// VGF2P8AFFINEQB (constant 0) transform of the matching block of each of the
// 6 inputs, and stored back. The 8-byte matrix entry at index input*5+output
// is the affine matrix for that input/output pair. Returns immediately when
// n>>6 is zero; bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 25..29 are read per block)
//   BX SI DI R8 R9 DX       in[0..5] data pointers
//   R11 R12 R13 R14 R10     out[0..4] data pointers
//   R15           start offset
//   Z0-Z24        broadcast matrix entries 0..24
//   Z25-Z29       output accumulators, Z30 input block, Z31 scratch
TEXT ·mulGFNI_6x5_64Xor(SB), $0-88
// Loading 25 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 37 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x06, AX
JZ mulGFNI_6x5_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R10
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, R10
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, DX
mulGFNI_6x5_64Xor_loop:
// Load 5 outputs
VMOVDQU64 (R11), Z25
VMOVDQU64 (R12), Z26
VMOVDQU64 (R13), Z27
VMOVDQU64 (R14), Z28
VMOVDQU64 (R10), Z29
// Load and process 64 bytes from input 0 to 5 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// Entries 25..29 were not preloaded; .BCST broadcasts each from memory.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
VMOVDQU64 Z25, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z26, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x5_64Xor_loop
VZEROUPPER
mulGFNI_6x5_64Xor_end:
RET
// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start in every
// slice. For each block, each of the 5 outputs is loaded, XORed with the
// VGF2P8AFFINEQB (constant 0) transform of the matching block of each of the
// 6 inputs, and stored back. The 8-byte matrix entry at index input*5+output
// is the affine matrix for that input/output pair. Returns immediately when
// n>>5 is zero; bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 9..29 are re-read per block)
//   BX SI DI R8 R9 DX       in[0..5] data pointers
//   R11 R12 R13 R14 R10     out[0..4] data pointers
//   R15           start offset
//   Y0-Y8         broadcast matrix entries 0..8
//   Y9-Y13        output accumulators, Y14 input block, Y15 scratch / entry
TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88
// Loading 9 of 30 tables to registers
// Destination kept in GP registers
// Full registers estimated 37 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x5Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R10
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, R10
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, DX
mulAvxGFNI_6x5Xor_loop:
// Load 5 outputs
VMOVDQU (R11), Y9
VMOVDQU (R12), Y10
VMOVDQU (R13), Y11
VMOVDQU (R14), Y12
VMOVDQU (R10), Y13
// Load and process 32 bytes from input 0 to 5 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// Matrix entries from here on do not fit in registers; each is broadcast
// from memory into Y15 immediately before use.
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
VMOVDQU Y9, (R11)
ADDQ $0x20, R11
VMOVDQU Y10, (R12)
ADDQ $0x20, R12
VMOVDQU Y11, (R13)
ADDQ $0x20, R13
VMOVDQU Y12, (R14)
ADDQ $0x20, R14
VMOVDQU Y13, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_6x5Xor_loop
VZEROUPPER
mulAvxGFNI_6x5Xor_end:
RET
// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start in every
// slice. For each block, each of the 6 outputs is overwritten with the XOR of
// the VGF2P8AFFINEQB (constant 0) transforms of the matching block of each of
// the 6 inputs. The 8-byte matrix entry at index input*6+output is the affine
// matrix for that input/output pair. Returns immediately when n>>6 is zero;
// bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 24..35 are read per block)
//   BX SI DI R8 R9 DX           in[0..5] data pointers
//   R11 R12 R13 R14 R15 R10     out[0..5] data pointers
//   BP            start offset (BP is used as a general-purpose register here;
//                 presumably the non-zero $8 frame is what lets the assembler
//                 save/restore it - confirm against the Go assembler docs)
//   Z0-Z23        broadcast matrix entries 0..23
//   Z24-Z29       output accumulators, Z30 input block, Z31 scratch
TEXT ·mulGFNI_6x6_64(SB), $8-88
// Loading 24 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 44 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x06, AX
JZ mulGFNI_6x6_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R15
MOVQ 120(R10), R10
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R10
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, DX
mulGFNI_6x6_64_loop:
// Load and process 64 bytes from input 0 to 6 outputs
// Input 0 initialises the accumulators directly, so prior output is discarded.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// Entries 24..35 were not preloaded; .BCST broadcasts each from memory.
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x6_64_loop
VZEROUPPER
mulGFNI_6x6_64_end:
RET
// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start in every
// slice. For each block, each of the 6 outputs is overwritten with the XOR of
// the VGF2P8AFFINEQB (constant 0) transforms of the matching block of each of
// the 6 inputs. The 8-byte matrix entry at index input*6+output is the affine
// matrix for that input/output pair. Returns immediately when n>>5 is zero;
// bytes beyond the last whole block are not touched.
//
// Register roles:
//   AX            blocks remaining
//   CX            matrix base (kept live: entries 8..35 are re-read per block)
//   BX SI DI R8 R9 DX           in[0..5] data pointers
//   R11 R12 R13 R14 R15 R10     out[0..5] data pointers
//   BP            start offset (BP is used as a general-purpose register here;
//                 presumably the non-zero $8 frame is what lets the assembler
//                 save/restore it - confirm against the Go assembler docs)
//   Y0-Y7         broadcast matrix entries 0..7
//   Y8-Y13        output accumulators, Y14 input block, Y15 scratch / entry
TEXT ·mulAvxGFNI_6x6(SB), $8-88
// Loading 8 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 44 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ sets ZF from its result, so the block count can be tested directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x6_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// in and out are arrays of 24-byte slice headers; the data pointer is the
// first word of each header.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R15
MOVQ 120(R10), R10
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R10
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, DX
mulAvxGFNI_6x6_loop:
// Load and process 32 bytes from input 0 to 6 outputs
// Input 0 initialises the accumulators directly, so prior output is discarded.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// Matrix entries from here on do not fit in registers; each is broadcast
// from memory into Y15 immediately before use.
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_6x6_loop
VZEROUPPER
mulAvxGFNI_6x6_end:
RET
// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6 inputs -> 6 outputs, 64 bytes per iteration, accumulating into out.
// For each of the n>>6 consecutive 64-byte blocks beginning at byte offset
// start, every output j (0..5) is loaded, XORed with the six products
// VGF2P8AFFINEQB(in[i] block, matrix[i*6+j], imm 0) for i = 0..5, and stored
// back. Returns without touching memory when n>>6 is zero; the trailing
// n&63 bytes are never processed here.
//
// Register roles:
//   AX                     remaining 64-byte blocks
//   CX                     matrix base (36 uint64 entries)
//   BX SI DI R8 R9 DX      read cursors for in[0..5]
//   R11 R12 R13 R14 R15 R10  write cursors for out[0..5]
//   BP                     start offset (scratch)
//   Z0-Z23                 matrix entries 0..23 (inputs 0..3), broadcast once
//   Z24-Z29                accumulators for out[0..5]
//   Z30 / Z31              current input block / product scratch
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP around the body - confirm against the Go asm guide.
TEXT ·mulGFNI_6x6_64Xor(SB), $8-88
// Loading 24 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 44 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks. SHRQ with a non-zero immediate sets
// ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_6x6_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Fetch the data pointer of each in[i]; slice headers are 24 bytes apart.
// DX holds the header array first and is overwritten last with in[5].
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// Same for out[j]; R10 holds the header array and ends up as out[5].
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R15
MOVQ 120(R10), R10
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R10
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, DX
mulGFNI_6x6_64Xor_loop:
// Load 6 outputs
VMOVDQU64 (R11), Z24
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (R10), Z29
// Load and process 64 bytes from input 0 to 6 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// Matrix entries 24..35 are not register-resident; they are read from
// memory with embedded broadcast on every iteration.
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_6x6_64Xor_loop
VZEROUPPER
mulGFNI_6x6_64Xor_end:
RET
// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6 inputs -> 6 outputs, 32 bytes per iteration, accumulating into out.
// For each of the n>>5 consecutive 32-byte blocks beginning at byte offset
// start, every output j (0..5) is loaded, XORed with the six products
// VGF2P8AFFINEQB(in[i] block, matrix[i*6+j], imm 0) for i = 0..5, and stored
// back. Returns without touching memory when n>>5 is zero; the trailing
// n&31 bytes are never processed here.
//
// Register roles:
//   AX                     remaining 32-byte blocks
//   CX                     matrix base (36 uint64 entries)
//   BX SI DI R8 R9 DX      read cursors for in[0..5]
//   R11 R12 R13 R14 R15 R10  write cursors for out[0..5]
//   BP                     start offset (scratch)
//   Y0-Y7                  matrix entries 0..7, broadcast once
//   Y8-Y13                 accumulators for out[0..5]
//   Y14                    current input block
//   Y15                    re-broadcast matrix entry, then product scratch
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP around the body - confirm against the Go asm guide.
TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88
// Loading 8 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 44 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks. SHRQ with a non-zero immediate sets
// ZF from its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x6Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Fetch the data pointer of each in[i]; slice headers are 24 bytes apart.
// DX holds the header array first and is overwritten last with in[5].
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// Same for out[j]; R10 holds the header array and ends up as out[5].
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R13
MOVQ 72(R10), R14
MOVQ 96(R10), R15
MOVQ 120(R10), R10
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R10
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, DX
mulAvxGFNI_6x6Xor_loop:
// Load 6 outputs
VMOVDQU (R11), Y8
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (R10), Y13
// Load and process 32 bytes from input 0 to 6 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
// From matrix entry 8 onwards the entry is re-broadcast into Y15 each
// iteration, since only entries 0..7 stay resident in Y0-Y7.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_6x6Xor_loop
VZEROUPPER
mulAvxGFNI_6x6Xor_end:
RET
// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6 inputs -> 7 outputs, 64 bytes per iteration, overwriting out.
// For each of the n>>6 consecutive 64-byte blocks beginning at byte offset
// start, every output j (0..6) is set to the XOR over i = 0..5 of
// VGF2P8AFFINEQB(in[i] block, matrix[i*7+j], imm 0). Previous contents of
// out are not read. Returns without touching memory when n>>6 is zero; the
// trailing n&63 bytes are never processed here.
//
// Register roles:
//   CX                     matrix base (42 uint64 entries)
//   DX BX SI DI R8 AX      read cursors for in[0..5]
//   R10 R11 R12 R13 R14 R15 R9  write cursors for out[0..6]
//   BP                     start offset, then remaining 64-byte blocks
//   Z0-Z22                 matrix entries 0..22, broadcast once
//   Z23-Z29                accumulators for out[0..6]
//   Z30 / Z31              current input block / product scratch
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP around the body - confirm against the Go asm guide.
TEXT ·mulGFNI_6x7_64(SB), $8-88
// Loading 23 of 42 tables to registers
// Destination kept in GP registers
// Full registers estimated 51 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Early-out check only: AX is reused as an input cursor below and the block
// count is recomputed into BP. SHRQ with a non-zero immediate sets ZF from
// its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_6x7_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Fetch the data pointer of each in[i]; slice headers are 24 bytes apart.
// AX holds the header array first and is overwritten last with in[5].
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), AX
// Same for out[j]; R9 holds the header array and ends up as out[6].
MOVQ out_base+48(FP), R9
MOVQ (R9), R10
MOVQ 24(R9), R11
MOVQ 48(R9), R12
MOVQ 72(R9), R13
MOVQ 96(R9), R14
MOVQ 120(R9), R15
MOVQ 144(R9), R9
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R9
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_6x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// Input 0 writes the accumulators directly, which is what makes this the
// overwriting (non-Xor) variant.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
// Matrix entries 23..41 are not register-resident; they are read from
// memory with embedded broadcast on every iteration.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R9)
ADDQ $0x40, R9
// Prepare for next loop
DECQ BP
JNZ mulGFNI_6x7_64_loop
VZEROUPPER
mulGFNI_6x7_64_end:
RET
// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6 inputs -> 7 outputs, 32 bytes per iteration, overwriting out.
// For each of the n>>5 consecutive 32-byte blocks beginning at byte offset
// start, every output j (0..6) is set to the XOR over i = 0..5 of
// VGF2P8AFFINEQB(in[i] block, matrix[i*7+j], imm 0). Previous contents of
// out are not read. Returns without touching memory when n>>5 is zero; the
// trailing n&31 bytes are never processed here.
//
// Register roles:
//   CX                     matrix base (42 uint64 entries)
//   DX BX SI DI R8 AX      read cursors for in[0..5]
//   R10 R11 R12 R13 R14 R15 R9  write cursors for out[0..6]
//   BP                     start offset, then remaining 32-byte blocks
//   Y0-Y6                  matrix entries 0..6 (input 0), broadcast once
//   Y7-Y13                 accumulators for out[0..6]
//   Y14                    current input block
//   Y15                    re-broadcast matrix entry, then product scratch
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP around the body - confirm against the Go asm guide.
TEXT ·mulAvxGFNI_6x7(SB), $8-88
// Loading 7 of 42 tables to registers
// Destination kept in GP registers
// Full registers estimated 51 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Early-out check only: AX is reused as an input cursor below and the block
// count is recomputed into BP. SHRQ with a non-zero immediate sets ZF from
// its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x7_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Fetch the data pointer of each in[i]; slice headers are 24 bytes apart.
// AX holds the header array first and is overwritten last with in[5].
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), AX
// Same for out[j]; R9 holds the header array and ends up as out[6].
MOVQ out_base+48(FP), R9
MOVQ (R9), R10
MOVQ 24(R9), R11
MOVQ 48(R9), R12
MOVQ 72(R9), R13
MOVQ 96(R9), R14
MOVQ 120(R9), R15
MOVQ 144(R9), R9
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R9
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_6x7_loop:
// Load and process 32 bytes from input 0 to 7 outputs
// Input 0 writes the accumulators directly, which is what makes this the
// overwriting (non-Xor) variant.
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// From here on each matrix entry is re-broadcast into Y15 every iteration,
// since only entries 0..6 stay resident in Y0-Y6.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R9)
ADDQ $0x20, R9
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_6x7_loop
VZEROUPPER
mulAvxGFNI_6x7_end:
RET
// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6 inputs -> 7 outputs, 64 bytes per iteration, accumulating into out.
// For each of the n>>6 consecutive 64-byte blocks beginning at byte offset
// start, every output j (0..6) is loaded, XORed with the six products
// VGF2P8AFFINEQB(in[i] block, matrix[i*7+j], imm 0) for i = 0..5, and stored
// back. Returns without touching memory when n>>6 is zero; the trailing
// n&63 bytes are never processed here.
//
// Register roles:
//   CX                     matrix base (42 uint64 entries)
//   DX BX SI DI R8 AX      read cursors for in[0..5]
//   R10 R11 R12 R13 R14 R15 R9  write cursors for out[0..6]
//   BP                     start offset, then remaining 64-byte blocks
//   Z0-Z22                 matrix entries 0..22, broadcast once
//   Z23-Z29                accumulators for out[0..6]
//   Z30 / Z31              current input block / product scratch
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP around the body - confirm against the Go asm guide.
TEXT ·mulGFNI_6x7_64Xor(SB), $8-88
// Loading 23 of 42 tables to registers
// Destination kept in GP registers
// Full registers estimated 51 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Early-out check only: AX is reused as an input cursor below and the block
// count is recomputed into BP. SHRQ with a non-zero immediate sets ZF from
// its result, so the branch needs no separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_6x7_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Fetch the data pointer of each in[i]; slice headers are 24 bytes apart.
// AX holds the header array first and is overwritten last with in[5].
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), AX
// Same for out[j]; R9 holds the header array and ends up as out[6].
MOVQ out_base+48(FP), R9
MOVQ (R9), R10
MOVQ 24(R9), R11
MOVQ 48(R9), R12
MOVQ 72(R9), R13
MOVQ 96(R9), R14
MOVQ 120(R9), R15
MOVQ 144(R9), R9
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R9
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_6x7_64Xor_loop:
// Load 7 outputs
VMOVDQU64 (R10), Z23
VMOVDQU64 (R11), Z24
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (R9), Z29
// Load and process 64 bytes from input 0 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
// Matrix entries 23..41 are not register-resident; they are read from
// memory with embedded broadcast on every iteration.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
VMOVDQU64 Z24, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R9)
ADDQ $0x40, R9
// Prepare for next loop
DECQ BP
JNZ mulGFNI_6x7_64Xor_loop
VZEROUPPER
mulGFNI_6x7_64Xor_end:
RET
// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6 inputs -> 7 outputs, 32 bytes per iteration, accumulating into out.
// For each of the n>>5 consecutive 32-byte blocks beginning at byte offset
// start, every output j (0..6) is loaded, XORed with the six products
// VGF2P8AFFINEQB(in[i] block, matrix[i*7+j], imm 0) for i = 0..5, and stored
// back. Returns without touching memory when n>>5 is zero; the trailing
// n&31 bytes are never processed here.
//
// Register roles:
//   CX                     matrix base (42 uint64 entries)
//   DX BX SI DI R8 AX      read cursors for in[0..5]
//   R10 R11 R12 R13 R14 R15 R9  write cursors for out[0..6]
//   BP                     start offset, then remaining 32-byte blocks
//   Y0-Y6                  matrix entries 0..6 (input 0), broadcast once
//   Y7-Y13                 accumulators for out[0..6]
//   Y14                    current input block
//   Y15                    re-broadcast matrix entry, then product scratch
// NOTE(review): the 8-byte frame presumably exists so the assembler
// saves/restores BP around the body - confirm against the Go asm guide.
TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88
// Loading 7 of 42 tables to registers
// Destination kept in GP registers
// Full registers estimated 51 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Early-out check only: AX is reused as an input cursor below and the block
// count is recomputed into BP. SHRQ with a non-zero immediate sets ZF from
// its result, so the branch needs no separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_6x7Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Fetch the data pointer of each in[i]; slice headers are 24 bytes apart.
// AX holds the header array first and is overwritten last with in[5].
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), AX
// Same for out[j]; R9 holds the header array and ends up as out[6].
MOVQ out_base+48(FP), R9
MOVQ (R9), R10
MOVQ 24(R9), R11
MOVQ 48(R9), R12
MOVQ 72(R9), R13
MOVQ 96(R9), R14
MOVQ 120(R9), R15
MOVQ 144(R9), R9
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R9
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_6x7Xor_loop:
// Load 7 outputs
VMOVDQU (R10), Y7
VMOVDQU (R11), Y8
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (R9), Y13
// Load and process 32 bytes from input 0 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// From here on each matrix entry is re-broadcast into Y15 every iteration,
// since only entries 0..6 stay resident in Y0-Y6.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R9)
ADDQ $0x20, R9
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_6x7Xor_loop
VZEROUPPER
mulAvxGFNI_6x7Xor_end:
RET
// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 8 outputs from 6 inputs, 64 bytes per loop iteration, for n>>6
// iterations (a trailing n mod 64 bytes is not processed here).
// For output j and byte offset k:
//   out[j][start+k] = XOR over i=0..5 of affine(matrix[i*8+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate. Previous contents of
// the outputs are overwritten.
//
// Register roles:
//   AX                   remaining loop iterations
//   CX                   matrix base (48 uint64 entries are read)
//   BX SI DI R8 R9 DX    read cursors for in[0]..in[5] (base + start)
//   R10                  base of the out slice-header array
//   R11                  byte offset applied to every output (starts at start)
//   R12                  scratch: base pointer of the output being stored
//   Z0-Z21               matrix entries 0..21, broadcast once before the loop
//   Z22-Z29              accumulators for outputs 0..7
//   Z30 / Z31            current input block / temporary product
TEXT ·mulGFNI_6x8_64(SB), $0-88
	// Loading 22 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly (same idiom as sSE2XorSlice in this file).
	SHRQ $0x06, AX
	JZ   mulGFNI_6x8_64_end

	// Matrix entries 0..21 stay resident; the remaining 26 are read from
	// memory with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Base pointers of the six input slices (slice headers are 24 bytes apart).
	// DX is loaded last because it doubles as the header-array pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// Each output base pointer is re-read from its slice header and indexed by R11.
	MOVQ      (R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x8_64_loop
	VZEROUPPER

mulGFNI_6x8_64_end:
	RET
// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 8 outputs from 6 inputs, 32 bytes per loop iteration, for n>>5
// iterations (a trailing n mod 32 bytes is not processed here).
// For output j and byte offset k:
//   out[j][start+k] = XOR over i=0..5 of affine(matrix[i*8+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate. Previous contents of
// the outputs are overwritten.
//
// Register roles:
//   AX                   remaining loop iterations
//   CX                   matrix base (48 uint64 entries are read)
//   BX SI DI R8 R9 DX    read cursors for in[0]..in[5] (base + start)
//   R10                  base of the out slice-header array
//   R11                  byte offset applied to every output (starts at start)
//   R12                  scratch: base pointer of the output being stored
//   Y0-Y5                matrix entries 0..5, broadcast once before the loop
//   Y6-Y13               accumulators for outputs 0..7
//   Y14 / Y15            current input block / matrix entry and product temp
TEXT ·mulAvxGFNI_6x8(SB), $0-88
	// Loading 6 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly (same idiom as sSE2XorSlice in this file).
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x8_end

	// Only matrix entries 0..5 stay resident; all others are re-broadcast
	// into a temporary on every iteration.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Base pointers of the six input slices (slice headers are 24 bytes apart).
	// DX is loaded last because it doubles as the header-array pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD   48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	// Each output base pointer is re-read from its slice header and indexed by R11.
	MOVQ    (R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ    24(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ    48(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ    72(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ    96(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ    120(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ    144(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ    168(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x20, R11
	DECQ AX
	JNZ  mulAvxGFNI_6x8_loop
	VZEROUPPER

mulAvxGFNI_6x8_end:
	RET
// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant of the 6-input / 8-output kernel: 64 bytes per loop
// iteration, n>>6 iterations (a trailing n mod 64 bytes is not processed here).
// For output j and byte offset k:
//   out[j][start+k] ^= XOR over i=0..5 of affine(matrix[i*8+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate. The current output
// contents are loaded first and the products are XORed into them.
//
// Register roles:
//   AX                   remaining loop iterations
//   CX                   matrix base (48 uint64 entries are read)
//   BX SI DI R8 R9 DX    read cursors for in[0]..in[5] (base + start)
//   R10                  base of the out slice-header array
//   R11                  byte offset applied to every output (starts at start)
//   R12                  scratch: base pointer of the output being accessed
//   Z0-Z21               matrix entries 0..21, broadcast once before the loop
//   Z22-Z29              accumulators for outputs 0..7
//   Z30 / Z31            current input block / temporary product
TEXT ·mulGFNI_6x8_64Xor(SB), $0-88
	// Loading 22 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly (same idiom as sSE2XorSlice in this file).
	SHRQ $0x06, AX
	JZ   mulGFNI_6x8_64Xor_end

	// Matrix entries 0..21 stay resident; the remaining 26 are read from
	// memory with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Base pointers of the six input slices (slice headers are 24 bytes apart).
	// DX is loaded last because it doubles as the header-array pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x8_64Xor_loop:
	// Load 8 outputs
	// The existing output bytes seed the accumulators.
	MOVQ      (R10), R12
	VMOVDQU64 (R12)(R11*1), Z22
	MOVQ      24(R10), R12
	VMOVDQU64 (R12)(R11*1), Z23
	MOVQ      48(R10), R12
	VMOVDQU64 (R12)(R11*1), Z24
	MOVQ      72(R10), R12
	VMOVDQU64 (R12)(R11*1), Z25
	MOVQ      96(R10), R12
	VMOVDQU64 (R12)(R11*1), Z26
	MOVQ      120(R10), R12
	VMOVDQU64 (R12)(R11*1), Z27
	MOVQ      144(R10), R12
	VMOVDQU64 (R12)(R11*1), Z28
	MOVQ      168(R10), R12
	VMOVDQU64 (R12)(R11*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z16, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z17, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 8 outputs
	// Each output base pointer is re-read from its slice header and indexed by R11.
	MOVQ      (R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x8_64Xor_loop
	VZEROUPPER

mulGFNI_6x8_64Xor_end:
	RET
// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant of the 6-input / 8-output kernel: 32 bytes per loop
// iteration, n>>5 iterations (a trailing n mod 32 bytes is not processed here).
// For output j and byte offset k:
//   out[j][start+k] ^= XOR over i=0..5 of affine(matrix[i*8+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate. The current output
// contents are loaded first and the products are XORed into them.
//
// Register roles:
//   AX                   remaining loop iterations
//   CX                   matrix base (48 uint64 entries are read)
//   BX SI DI R8 R9 DX    read cursors for in[0]..in[5] (base + start)
//   R10                  base of the out slice-header array
//   R11                  byte offset applied to every output (starts at start)
//   R12                  scratch: base pointer of the output being accessed
//   Y0-Y5                matrix entries 0..5, broadcast once before the loop
//   Y6-Y13               accumulators for outputs 0..7
//   Y14 / Y15            current input block / matrix entry and product temp
TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88
	// Loading 6 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 58 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly (same idiom as sSE2XorSlice in this file).
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x8Xor_end

	// Only matrix entries 0..5 stay resident; all others are re-broadcast
	// into a temporary on every iteration.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Base pointers of the six input slices (slice headers are 24 bytes apart).
	// DX is loaded last because it doubles as the header-array pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x8Xor_loop:
	// Load 8 outputs
	// The existing output bytes seed the accumulators.
	MOVQ    (R10), R12
	VMOVDQU (R12)(R11*1), Y6
	MOVQ    24(R10), R12
	VMOVDQU (R12)(R11*1), Y7
	MOVQ    48(R10), R12
	VMOVDQU (R12)(R11*1), Y8
	MOVQ    72(R10), R12
	VMOVDQU (R12)(R11*1), Y9
	MOVQ    96(R10), R12
	VMOVDQU (R12)(R11*1), Y10
	MOVQ    120(R10), R12
	VMOVDQU (R12)(R11*1), Y11
	MOVQ    144(R10), R12
	VMOVDQU (R12)(R11*1), Y12
	MOVQ    168(R10), R12
	VMOVDQU (R12)(R11*1), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 8 outputs
	// Each output base pointer is re-read from its slice header and indexed by R11.
	MOVQ    (R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ    24(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ    48(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ    72(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ    96(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ    120(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ    144(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ    168(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x20, R11
	DECQ AX
	JNZ  mulAvxGFNI_6x8Xor_loop
	VZEROUPPER

mulAvxGFNI_6x8Xor_end:
	RET
// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 9 outputs from 6 inputs, 64 bytes per loop iteration, for n>>6
// iterations (a trailing n mod 64 bytes is not processed here).
// For output j and byte offset k:
//   out[j][start+k] = XOR over i=0..5 of affine(matrix[i*9+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate. Previous contents of
// the outputs are overwritten.
//
// Register roles:
//   AX                   remaining loop iterations
//   CX                   matrix base (54 uint64 entries are read)
//   BX SI DI R8 R9 DX    read cursors for in[0]..in[5] (base + start)
//   R10                  base of the out slice-header array
//   R11                  byte offset applied to every output (starts at start)
//   R12                  scratch: base pointer of the output being stored
//   Z0-Z20               matrix entries 0..20, broadcast once before the loop
//   Z21-Z29              accumulators for outputs 0..8
//   Z30 / Z31            current input block / temporary product
TEXT ·mulGFNI_6x9_64(SB), $0-88
	// Loading 21 of 54 tables to registers
	// Destination kept on stack
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly (same idiom as sSE2XorSlice in this file).
	SHRQ $0x06, AX
	JZ   mulGFNI_6x9_64_end

	// Matrix entries 0..20 stay resident; the remaining 33 are read from
	// memory with an embedded broadcast inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Base pointers of the six input slices (slice headers are 24 bytes apart).
	// DX is loaded last because it doubles as the header-array pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulGFNI_6x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	// Each output base pointer is re-read from its slice header and indexed by R11.
	MOVQ      (R10), R12
	VMOVDQU64 Z21, (R12)(R11*1)
	MOVQ      24(R10), R12
	VMOVDQU64 Z22, (R12)(R11*1)
	MOVQ      48(R10), R12
	VMOVDQU64 Z23, (R12)(R11*1)
	MOVQ      72(R10), R12
	VMOVDQU64 Z24, (R12)(R11*1)
	MOVQ      96(R10), R12
	VMOVDQU64 Z25, (R12)(R11*1)
	MOVQ      120(R10), R12
	VMOVDQU64 Z26, (R12)(R11*1)
	MOVQ      144(R10), R12
	VMOVDQU64 Z27, (R12)(R11*1)
	MOVQ      168(R10), R12
	VMOVDQU64 Z28, (R12)(R11*1)
	MOVQ      192(R10), R12
	VMOVDQU64 Z29, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x40, R11
	DECQ AX
	JNZ  mulGFNI_6x9_64_loop
	VZEROUPPER

mulGFNI_6x9_64_end:
	RET
// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 9 outputs from 6 inputs, 32 bytes per loop iteration, for n>>5
// iterations (a trailing n mod 32 bytes is not processed here).
// For output j and byte offset k:
//   out[j][start+k] = XOR over i=0..5 of affine(matrix[i*9+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero immediate. Previous contents of
// the outputs are overwritten.
//
// Register roles:
//   AX                   remaining loop iterations
//   CX                   matrix base (54 uint64 entries are read)
//   BX SI DI R8 R9 DX    read cursors for in[0]..in[5] (base + start)
//   R10                  base of the out slice-header array
//   R11                  byte offset applied to every output (starts at start)
//   R12                  scratch: base pointer of the output being stored
//   Y0-Y4                matrix entries 0..4, broadcast once before the loop
//   Y5-Y13               accumulators for outputs 0..8
//   Y14 / Y15            current input block / matrix entry and product temp
TEXT ·mulAvxGFNI_6x9(SB), $0-88
	// Loading 5 of 54 tables to registers
	// Destination kept on stack
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly (same idiom as sSE2XorSlice in this file).
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_6x9_end

	// Only matrix entries 0..4 stay resident; all others are re-broadcast
	// into a temporary on every iteration.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Base pointers of the six input slices (slice headers are 24 bytes apart).
	// DX is loaded last because it doubles as the header-array pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), DX
	MOVQ out_base+48(FP), R10
	MOVQ start+72(FP), R11

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, DX

mulAvxGFNI_6x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	// Each output base pointer is re-read from its slice header and indexed by R11.
	MOVQ    (R10), R12
	VMOVDQU Y5, (R12)(R11*1)
	MOVQ    24(R10), R12
	VMOVDQU Y6, (R12)(R11*1)
	MOVQ    48(R10), R12
	VMOVDQU Y7, (R12)(R11*1)
	MOVQ    72(R10), R12
	VMOVDQU Y8, (R12)(R11*1)
	MOVQ    96(R10), R12
	VMOVDQU Y9, (R12)(R11*1)
	MOVQ    120(R10), R12
	VMOVDQU Y10, (R12)(R11*1)
	MOVQ    144(R10), R12
	VMOVDQU Y11, (R12)(R11*1)
	MOVQ    168(R10), R12
	VMOVDQU Y12, (R12)(R11*1)
	MOVQ    192(R10), R12
	VMOVDQU Y13, (R12)(R11*1)

	// Prepare for next loop
	ADDQ $0x20, R11
	DECQ AX
	JNZ  mulAvxGFNI_6x9_loop
	VZEROUPPER

mulAvxGFNI_6x9_end:
	RET
// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6-input / 9-output GFNI kernel, 64 bytes per iteration, accumulating
// into the existing output contents (the outputs are loaded, XORed with
// the new products and written back).
//
// Per iteration, for every output j in 0..8 and input i in 0..5:
//   out[j][off:off+64] ^= affine(matrix[i*9+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant and the uint64 matrix
// entry as the 8x8 bit-matrix operand (presumably encoding multiplication by
// one coding coefficient -- confirm against the Go caller).
//
// Register roles:
//   AX       iterations left (n >> 6); bytes beyond the last full 64-byte
//            chunk are not touched by this routine
//   CX       matrix base; entries 0..20 are cached in Z0-Z20, entries
//            21..53 are read from memory with an embedded broadcast
//   BX, SI, DI, R8, R9, DX   read cursors for in[0]..in[5] (start added)
//   R10      base of the out slice-header array (24 bytes per header)
//   R11      byte offset into every output; begins at start, +64 per loop
//   R12      scratch: data pointer of the output currently being accessed
//   Z21-Z29  accumulators for outputs 0..8
//   Z30      current 64-byte input block; Z31 holds one product
TEXT ·mulGFNI_6x9_64Xor(SB), $0-88
// Loading 21 of 54 tables to registers
// Destination kept on stack
// Full registers estimated 65 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 64-byte blocks; return if zero.
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so the TESTQ looks redundant; it is emitted by the generator.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_6x9_64Xor_end
// Broadcast matrix entries 0..20 (8 bytes each) across Z0-Z20.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// Fetch the data pointer of each of the six input slices (headers are
// 24 bytes apart). DX is reused: it holds the header array first and the
// pointer of in[5] last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// NOTE(review): the second load below repeats the first exactly (same
// source, same destination). It is redundant but harmless; this file is
// generated, so any cleanup belongs in the generator.
MOVQ out_base+48(FP), R10
MOVQ out_base+48(FP), R10
MOVQ start+72(FP), R11
// Add start offset to input
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, DX
mulGFNI_6x9_64Xor_loop:
// Load 9 outputs
// Each output pointer is re-read from its slice header and indexed by R11.
MOVQ (R10), R12
VMOVDQU64 (R12)(R11*1), Z21
MOVQ 24(R10), R12
VMOVDQU64 (R12)(R11*1), Z22
MOVQ 48(R10), R12
VMOVDQU64 (R12)(R11*1), Z23
MOVQ 72(R10), R12
VMOVDQU64 (R12)(R11*1), Z24
MOVQ 96(R10), R12
VMOVDQU64 (R12)(R11*1), Z25
MOVQ 120(R10), R12
VMOVDQU64 (R12)(R11*1), Z26
MOVQ 144(R10), R12
VMOVDQU64 (R12)(R11*1), Z27
MOVQ 168(R10), R12
VMOVDQU64 (R12)(R11*1), Z28
MOVQ 192(R10), R12
VMOVDQU64 (R12)(R11*1), Z29
// Load and process 64 bytes from input 0 to 9 outputs
// Matrix entries 0..8, all cached in Z0-Z8.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 9 outputs
// Matrix entries 9..17, all cached in Z9-Z17.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
// Matrix entries 18..20 come from Z18-Z20; from entry 21 (offset 168) on,
// the matrix operand is read from memory via embedded broadcast.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 9 outputs
// Matrix entries 27..35 (offsets 216..280).
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 9 outputs
// Matrix entries 36..44 (offsets 288..352).
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 9 outputs
// Matrix entries 45..53 (offsets 360..424).
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
// Written back to the same addresses they were loaded from above.
MOVQ (R10), R12
VMOVDQU64 Z21, (R12)(R11*1)
MOVQ 24(R10), R12
VMOVDQU64 Z22, (R12)(R11*1)
MOVQ 48(R10), R12
VMOVDQU64 Z23, (R12)(R11*1)
MOVQ 72(R10), R12
VMOVDQU64 Z24, (R12)(R11*1)
MOVQ 96(R10), R12
VMOVDQU64 Z25, (R12)(R11*1)
MOVQ 120(R10), R12
VMOVDQU64 Z26, (R12)(R11*1)
MOVQ 144(R10), R12
VMOVDQU64 Z27, (R12)(R11*1)
MOVQ 168(R10), R12
VMOVDQU64 Z28, (R12)(R11*1)
MOVQ 192(R10), R12
VMOVDQU64 Z29, (R12)(R11*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R11
DECQ AX
JNZ mulGFNI_6x9_64Xor_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulGFNI_6x9_64Xor_end:
RET
// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6-input / 9-output GFNI kernel using 256-bit registers, 32 bytes per
// iteration, accumulating into the existing output contents (outputs are
// loaded, XORed with the new products and written back).
//
// Per iteration, for every output j in 0..8 and input i in 0..5:
//   out[j][off:off+32] ^= affine(matrix[i*9+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant and the uint64 matrix
// entry as the 8x8 bit-matrix operand (presumably encoding multiplication by
// one coding coefficient -- confirm against the Go caller).
//
// Register roles:
//   AX       iterations left (n >> 5); bytes beyond the last full 32-byte
//            chunk are not touched by this routine
//   CX       matrix base; entries 0..4 are cached in Y0-Y4, every other
//            entry is broadcast into Y15 immediately before it is used
//   BX, SI, DI, R8, R9, DX   read cursors for in[0]..in[5] (start added)
//   R10      base of the out slice-header array (24 bytes per header)
//   R11      byte offset into every output; begins at start, +32 per loop
//   R12      scratch: data pointer of the output currently being accessed
//   Y5-Y13   accumulators for outputs 0..8
//   Y14      current 32-byte input block
//   Y15      scratch: broadcast matrix entry, then overwritten by the product
TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88
// Loading 5 of 54 tables to registers
// Destination kept on stack
// Full registers estimated 65 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 32-byte blocks; return if zero.
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so the TESTQ looks redundant; it is emitted by the generator.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_6x9Xor_end
// Broadcast matrix entries 0..4 (8 bytes each) across Y0-Y4.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Fetch the data pointer of each of the six input slices (headers are
// 24 bytes apart). DX is reused: it holds the header array first and the
// pointer of in[5] last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// NOTE(review): the second load below repeats the first exactly (same
// source, same destination). It is redundant but harmless; this file is
// generated, so any cleanup belongs in the generator.
MOVQ out_base+48(FP), R10
MOVQ out_base+48(FP), R10
MOVQ start+72(FP), R11
// Add start offset to input
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, DX
mulAvxGFNI_6x9Xor_loop:
// Load 9 outputs
// Each output pointer is re-read from its slice header and indexed by R11.
MOVQ (R10), R12
VMOVDQU (R12)(R11*1), Y5
MOVQ 24(R10), R12
VMOVDQU (R12)(R11*1), Y6
MOVQ 48(R10), R12
VMOVDQU (R12)(R11*1), Y7
MOVQ 72(R10), R12
VMOVDQU (R12)(R11*1), Y8
MOVQ 96(R10), R12
VMOVDQU (R12)(R11*1), Y9
MOVQ 120(R10), R12
VMOVDQU (R12)(R11*1), Y10
MOVQ 144(R10), R12
VMOVDQU (R12)(R11*1), Y11
MOVQ 168(R10), R12
VMOVDQU (R12)(R11*1), Y12
MOVQ 192(R10), R12
VMOVDQU (R12)(R11*1), Y13
// Load and process 32 bytes from input 0 to 9 outputs
// Matrix entries 0..4 come from Y0-Y4; entries 5..8 (offsets 40..64) are
// broadcast into Y15 on the fly.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
// Matrix entries 9..17 (offsets 72..136).
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
// Matrix entries 18..26 (offsets 144..208).
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
// Matrix entries 27..35 (offsets 216..280).
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 9 outputs
// Matrix entries 36..44 (offsets 288..352).
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 9 outputs
// Matrix entries 45..53 (offsets 360..424).
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
// Written back to the same addresses they were loaded from above.
MOVQ (R10), R12
VMOVDQU Y5, (R12)(R11*1)
MOVQ 24(R10), R12
VMOVDQU Y6, (R12)(R11*1)
MOVQ 48(R10), R12
VMOVDQU Y7, (R12)(R11*1)
MOVQ 72(R10), R12
VMOVDQU Y8, (R12)(R11*1)
MOVQ 96(R10), R12
VMOVDQU Y9, (R12)(R11*1)
MOVQ 120(R10), R12
VMOVDQU Y10, (R12)(R11*1)
MOVQ 144(R10), R12
VMOVDQU Y11, (R12)(R11*1)
MOVQ 168(R10), R12
VMOVDQU Y12, (R12)(R11*1)
MOVQ 192(R10), R12
VMOVDQU Y13, (R12)(R11*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x20, R11
DECQ AX
JNZ mulAvxGFNI_6x9Xor_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulAvxGFNI_6x9Xor_end:
RET
// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6-input / 10-output GFNI kernel, 64 bytes per iteration, overwriting the
// outputs: the products of input 0 initialise the accumulators, so the
// previous output contents are never read.
//
// Per iteration, for every output j in 0..9:
//   out[j][off:off+64] = XOR over i in 0..5 of
//                        affine(matrix[i*10+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant and the uint64 matrix
// entry as the 8x8 bit-matrix operand (presumably encoding multiplication by
// one coding coefficient -- confirm against the Go caller).
//
// Register roles:
//   AX       iterations left (n >> 6); bytes beyond the last full 64-byte
//            chunk are not touched by this routine
//   CX       matrix base; entries 0..19 are cached in Z0-Z19, entries
//            20..59 are read from memory with an embedded broadcast
//   BX, SI, DI, R8, R9, DX   read cursors for in[0]..in[5] (start added)
//   R10      base of the out slice-header array (24 bytes per header)
//   R11      byte offset into every output; begins at start, +64 per loop
//   R12      scratch: data pointer of the output currently being stored
//   Z20-Z29  accumulators for outputs 0..9
//   Z30      current 64-byte input block; Z31 holds one product
TEXT ·mulGFNI_6x10_64(SB), $0-88
// Loading 20 of 60 tables to registers
// Destination kept on stack
// Full registers estimated 72 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 64-byte blocks; return if zero.
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so the TESTQ looks redundant; it is emitted by the generator.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_6x10_64_end
// Broadcast matrix entries 0..19 (8 bytes each) across Z0-Z19.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each of the six input slices (headers are
// 24 bytes apart). DX is reused: it holds the header array first and the
// pointer of in[5] last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// NOTE(review): the second load below repeats the first exactly (same
// source, same destination). It is redundant but harmless; this file is
// generated, so any cleanup belongs in the generator.
MOVQ out_base+48(FP), R10
MOVQ out_base+48(FP), R10
MOVQ start+72(FP), R11
// Add start offset to input
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, DX
mulGFNI_6x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// The first input writes its products straight into the accumulators
// (matrix entries 0..9 in Z0-Z9), which is what makes this the
// overwriting variant.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
// Matrix entries 10..19, all cached in Z10-Z19.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// Matrix entries 20..29 (offsets 160..232), read with embedded broadcast.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
// Matrix entries 30..39 (offsets 240..312).
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
// Matrix entries 40..49 (offsets 320..392).
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
// Matrix entries 50..59 (offsets 400..472).
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
// Each output pointer is re-read from its slice header and indexed by R11.
MOVQ (R10), R12
VMOVDQU64 Z20, (R12)(R11*1)
MOVQ 24(R10), R12
VMOVDQU64 Z21, (R12)(R11*1)
MOVQ 48(R10), R12
VMOVDQU64 Z22, (R12)(R11*1)
MOVQ 72(R10), R12
VMOVDQU64 Z23, (R12)(R11*1)
MOVQ 96(R10), R12
VMOVDQU64 Z24, (R12)(R11*1)
MOVQ 120(R10), R12
VMOVDQU64 Z25, (R12)(R11*1)
MOVQ 144(R10), R12
VMOVDQU64 Z26, (R12)(R11*1)
MOVQ 168(R10), R12
VMOVDQU64 Z27, (R12)(R11*1)
MOVQ 192(R10), R12
VMOVDQU64 Z28, (R12)(R11*1)
MOVQ 216(R10), R12
VMOVDQU64 Z29, (R12)(R11*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R11
DECQ AX
JNZ mulGFNI_6x10_64_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulGFNI_6x10_64_end:
RET
// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 6-input / 10-output GFNI kernel using 256-bit registers, 32 bytes per
// iteration, overwriting the outputs: the products of input 0 initialise the
// accumulators, so the previous output contents are never read.
//
// Per iteration, for every output j in 0..9:
//   out[j][off:off+32] = XOR over i in 0..5 of
//                        affine(matrix[i*10+j], in[i][off:off+32])
// where affine is VGF2P8AFFINEQB with a zero constant and the uint64 matrix
// entry as the 8x8 bit-matrix operand (presumably encoding multiplication by
// one coding coefficient -- confirm against the Go caller).
//
// Register roles:
//   AX       iterations left (n >> 5); bytes beyond the last full 32-byte
//            chunk are not touched by this routine
//   CX       matrix base; entries 0..3 are cached in Y0-Y3, every other
//            entry is broadcast from memory immediately before it is used
//   BX, SI, DI, R8, R9, DX   read cursors for in[0]..in[5] (start added)
//   R10      base of the out slice-header array (24 bytes per header)
//   R11      byte offset into every output; begins at start, +32 per loop
//   R12      scratch: data pointer of the output currently being stored
//   Y4-Y13   accumulators for outputs 0..9
//   Y14      current 32-byte input block
//   Y15      scratch: broadcast matrix entry, then overwritten by the product
TEXT ·mulAvxGFNI_6x10(SB), $0-88
// Loading 4 of 60 tables to registers
// Destination kept on stack
// Full registers estimated 72 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 32-byte blocks; return if zero.
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so the TESTQ looks redundant; it is emitted by the generator.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_6x10_end
// Broadcast matrix entries 0..3 (8 bytes each) across Y0-Y3.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each of the six input slices (headers are
// 24 bytes apart). DX is reused: it holds the header array first and the
// pointer of in[5] last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// NOTE(review): the second load below repeats the first exactly (same
// source, same destination). It is redundant but harmless; this file is
// generated, so any cleanup belongs in the generator.
MOVQ out_base+48(FP), R10
MOVQ out_base+48(FP), R10
MOVQ start+72(FP), R11
// Add start offset to input
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, DX
mulAvxGFNI_6x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// The first input writes its products straight into the accumulators.
// Outputs 0..3 use the cached entries in Y0-Y3; for outputs 4..9 the matrix
// entry (offsets 32..72) is broadcast into the accumulator register itself
// and then replaced in place by the product.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
// Matrix entries 10..19 (offsets 80..152), broadcast into Y15 on the fly.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
// Matrix entries 20..29 (offsets 160..232).
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
// Matrix entries 30..39 (offsets 240..312).
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
// Matrix entries 40..49 (offsets 320..392).
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
// Matrix entries 50..59 (offsets 400..472).
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
// Each output pointer is re-read from its slice header and indexed by R11.
MOVQ (R10), R12
VMOVDQU Y4, (R12)(R11*1)
MOVQ 24(R10), R12
VMOVDQU Y5, (R12)(R11*1)
MOVQ 48(R10), R12
VMOVDQU Y6, (R12)(R11*1)
MOVQ 72(R10), R12
VMOVDQU Y7, (R12)(R11*1)
MOVQ 96(R10), R12
VMOVDQU Y8, (R12)(R11*1)
MOVQ 120(R10), R12
VMOVDQU Y9, (R12)(R11*1)
MOVQ 144(R10), R12
VMOVDQU Y10, (R12)(R11*1)
MOVQ 168(R10), R12
VMOVDQU Y11, (R12)(R11*1)
MOVQ 192(R10), R12
VMOVDQU Y12, (R12)(R11*1)
MOVQ 216(R10), R12
VMOVDQU Y13, (R12)(R11*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x20, R11
DECQ AX
JNZ mulAvxGFNI_6x10_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulAvxGFNI_6x10_end:
RET
// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 6-input / 10-output GFNI kernel, 64 bytes per iteration, accumulating
// into the existing output contents (the outputs are loaded, XORed with
// the new products and written back).
//
// Per iteration, for every output j in 0..9 and input i in 0..5:
//   out[j][off:off+64] ^= affine(matrix[i*10+j], in[i][off:off+64])
// where affine is VGF2P8AFFINEQB with a zero constant and the uint64 matrix
// entry as the 8x8 bit-matrix operand (presumably encoding multiplication by
// one coding coefficient -- confirm against the Go caller).
//
// Register roles:
//   AX       iterations left (n >> 6); bytes beyond the last full 64-byte
//            chunk are not touched by this routine
//   CX       matrix base; entries 0..19 are cached in Z0-Z19, entries
//            20..59 are read from memory with an embedded broadcast
//   BX, SI, DI, R8, R9, DX   read cursors for in[0]..in[5] (start added)
//   R10      base of the out slice-header array (24 bytes per header)
//   R11      byte offset into every output; begins at start, +64 per loop
//   R12      scratch: data pointer of the output currently being accessed
//   Z20-Z29  accumulators for outputs 0..9
//   Z30      current 64-byte input block; Z31 holds one product
TEXT ·mulGFNI_6x10_64Xor(SB), $0-88
// Loading 20 of 60 tables to registers
// Destination kept on stack
// Full registers estimated 72 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 64-byte blocks; return if zero.
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so the TESTQ looks redundant; it is emitted by the generator.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_6x10_64Xor_end
// Broadcast matrix entries 0..19 (8 bytes each) across Z0-Z19.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each of the six input slices (headers are
// 24 bytes apart). DX is reused: it holds the header array first and the
// pointer of in[5] last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
// NOTE(review): the second load below repeats the first exactly (same
// source, same destination). It is redundant but harmless; this file is
// generated, so any cleanup belongs in the generator.
MOVQ out_base+48(FP), R10
MOVQ out_base+48(FP), R10
MOVQ start+72(FP), R11
// Add start offset to input
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, DX
mulGFNI_6x10_64Xor_loop:
// Load 10 outputs
// Each output pointer is re-read from its slice header and indexed by R11.
MOVQ (R10), R12
VMOVDQU64 (R12)(R11*1), Z20
MOVQ 24(R10), R12
VMOVDQU64 (R12)(R11*1), Z21
MOVQ 48(R10), R12
VMOVDQU64 (R12)(R11*1), Z22
MOVQ 72(R10), R12
VMOVDQU64 (R12)(R11*1), Z23
MOVQ 96(R10), R12
VMOVDQU64 (R12)(R11*1), Z24
MOVQ 120(R10), R12
VMOVDQU64 (R12)(R11*1), Z25
MOVQ 144(R10), R12
VMOVDQU64 (R12)(R11*1), Z26
MOVQ 168(R10), R12
VMOVDQU64 (R12)(R11*1), Z27
MOVQ 192(R10), R12
VMOVDQU64 (R12)(R11*1), Z28
MOVQ 216(R10), R12
VMOVDQU64 (R12)(R11*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
// Matrix entries 0..9, all cached in Z0-Z9.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
// Matrix entries 10..19, all cached in Z10-Z19.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// Matrix entries 20..29 (offsets 160..232), read with embedded broadcast.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
// Matrix entries 30..39 (offsets 240..312).
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
// Matrix entries 40..49 (offsets 320..392).
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
// Matrix entries 50..59 (offsets 400..472).
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
// Written back to the same addresses they were loaded from above.
MOVQ (R10), R12
VMOVDQU64 Z20, (R12)(R11*1)
MOVQ 24(R10), R12
VMOVDQU64 Z21, (R12)(R11*1)
MOVQ 48(R10), R12
VMOVDQU64 Z22, (R12)(R11*1)
MOVQ 72(R10), R12
VMOVDQU64 Z23, (R12)(R11*1)
MOVQ 96(R10), R12
VMOVDQU64 Z24, (R12)(R11*1)
MOVQ 120(R10), R12
VMOVDQU64 Z25, (R12)(R11*1)
MOVQ 144(R10), R12
VMOVDQU64 Z26, (R12)(R11*1)
MOVQ 168(R10), R12
VMOVDQU64 Z27, (R12)(R11*1)
MOVQ 192(R10), R12
VMOVDQU64 Z28, (R12)(R11*1)
MOVQ 216(R10), R12
VMOVDQU64 Z29, (R12)(R11*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R11
DECQ AX
JNZ mulGFNI_6x10_64Xor_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulGFNI_6x10_64Xor_end:
RET
// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Folds 6 input slices into 10 output slices, 32 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*10+j] is
// broadcast over a YMM register and applied to the input block with
// VGF2P8AFFINEQB (imm8 = 0); the result is XORed into the block previously
// loaded from out[j], and the updated block is written back.
// Work starts at byte offset start of every slice and runs for n>>5
// iterations; if n>>5 is zero the function returns without touching memory,
// and a trailing n%32 bytes are not processed.
//
// Register roles inside the loop:
//   AX            remaining 32-byte blocks
//   CX            matrix base (entries 4..59 are re-broadcast on every use)
//   Y0-Y3         matrix entries 0..3
//   BX,SI,DI,R8,R9,DX  running pointers into in[0]..in[5]
//   R10           address of the out slice headers (24 bytes each)
//   R11           running byte offset into every output
//   R12           scratch: data pointer of the current output
//   Y4-Y13        accumulators for out[0]..out[9]
//   Y14 / Y15     current input block / scratch
TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88
// Loading 4 of 60 tables to registers
// Destination kept on stack
// (output data pointers are re-read from the out slice headers each iteration)
// Full registers estimated 72 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_6x10Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input slice; DX is overwritten last
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), DX
MOVQ out_base+48(FP), R10
MOVQ start+72(FP), R11
// Add start offset to input
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, DX
mulAvxGFNI_6x10Xor_loop:
// Load 10 outputs
MOVQ (R10), R12
VMOVDQU (R12)(R11*1), Y4
MOVQ 24(R10), R12
VMOVDQU (R12)(R11*1), Y5
MOVQ 48(R10), R12
VMOVDQU (R12)(R11*1), Y6
MOVQ 72(R10), R12
VMOVDQU (R12)(R11*1), Y7
MOVQ 96(R10), R12
VMOVDQU (R12)(R11*1), Y8
MOVQ 120(R10), R12
VMOVDQU (R12)(R11*1), Y9
MOVQ 144(R10), R12
VMOVDQU (R12)(R11*1), Y10
MOVQ 168(R10), R12
VMOVDQU (R12)(R11*1), Y11
MOVQ 192(R10), R12
VMOVDQU (R12)(R11*1), Y12
MOVQ 216(R10), R12
VMOVDQU (R12)(R11*1), Y13
// Load and process 32 bytes from input 0 to 10 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y4, Y15, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 32(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R10), R12
VMOVDQU Y4, (R12)(R11*1)
MOVQ 24(R10), R12
VMOVDQU Y5, (R12)(R11*1)
MOVQ 48(R10), R12
VMOVDQU Y6, (R12)(R11*1)
MOVQ 72(R10), R12
VMOVDQU Y7, (R12)(R11*1)
MOVQ 96(R10), R12
VMOVDQU Y8, (R12)(R11*1)
MOVQ 120(R10), R12
VMOVDQU Y9, (R12)(R11*1)
MOVQ 144(R10), R12
VMOVDQU Y10, (R12)(R11*1)
MOVQ 168(R10), R12
VMOVDQU Y11, (R12)(R11*1)
MOVQ 192(R10), R12
VMOVDQU Y12, (R12)(R11*1)
MOVQ 216(R10), R12
VMOVDQU Y13, (R12)(R11*1)
// Prepare for next loop
ADDQ $0x20, R11
DECQ AX
JNZ mulAvxGFNI_6x10Xor_loop
// Clear upper YMM state before returning to non-AVX code
VZEROUPPER
mulAvxGFNI_6x10Xor_end:
RET
// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 7 input slices into 1 output slice, 64 bytes per loop iteration.
// The 64-bit entry matrix[i] is broadcast over a ZMM register and applied to
// the block from in[i] with VGF2P8AFFINEQB (imm8 = 0); the 7 results are
// XORed together and stored to out[0], replacing its previous contents.
// Work starts at byte offset start of every slice and runs for n>>6
// iterations; if n>>6 is zero the function returns without touching memory,
// and a trailing n%64 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 64-byte blocks
//   Z0-Z6                  matrix entries 0..6
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R10                    running pointer into out[0]
//   Z7 / Z8                accumulator / current input block and scratch
TEXT ·mulGFNI_7x1_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x1_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of out[0]'s slice header, then its data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R10
MOVQ start+72(FP), R11
// Add start offset to output
ADDQ R11, R10
// Add start offset to input
ADDQ R11, DX
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, CX
mulGFNI_7x1_64_loop:
// Load and process 64 bytes from input 0 to 1 outputs
VMOVDQU64 (DX), Z8
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z8, Z7
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z8
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z8
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z8
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z8
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (R9), Z8
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 6 to 1 outputs
VMOVDQU64 (CX), Z8
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
VXORPD Z7, Z8, Z7
// Store 1 outputs
VMOVDQU64 Z7, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_7x1_64_loop
// Clear upper vector state before returning to non-AVX code
VZEROUPPER
mulGFNI_7x1_64_end:
RET
// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 7 input slices into 1 output slice, 32 bytes per loop iteration.
// The 64-bit entry matrix[i] is broadcast over a YMM register and applied to
// the block from in[i] with VGF2P8AFFINEQB (imm8 = 0); the 7 results are
// XORed together and stored to out[0], replacing its previous contents.
// Work starts at byte offset start of every slice and runs for n>>5
// iterations; if n>>5 is zero the function returns without touching memory,
// and a trailing n%32 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 32-byte blocks
//   Y0-Y6                  matrix entries 0..6
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R10                    running pointer into out[0]
//   Y7 / Y8                accumulator / current input block and scratch
TEXT ·mulAvxGFNI_7x1(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x1_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of out[0]'s slice header, then its data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R10
MOVQ start+72(FP), R11
// Add start offset to output
ADDQ R11, R10
// Add start offset to input
ADDQ R11, DX
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, CX
mulAvxGFNI_7x1_loop:
// Load and process 32 bytes from input 0 to 1 outputs
VMOVDQU (DX), Y8
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y8, Y7
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y8
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y8
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y8
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y8
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (R9), Y8
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 6 to 1 outputs
VMOVDQU (CX), Y8
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
VXORPD Y7, Y8, Y7
// Store 1 outputs
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_7x1_loop
// Clear upper YMM state before returning to non-AVX code
VZEROUPPER
mulAvxGFNI_7x1_end:
RET
// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Folds 7 input slices into 1 output slice, 64 bytes per loop iteration.
// The 64-bit entry matrix[i] is broadcast over a ZMM register and applied to
// the block from in[i] with VGF2P8AFFINEQB (imm8 = 0); each result is XORed
// into the block previously loaded from out[0], and the updated block is
// written back.
// Work starts at byte offset start of every slice and runs for n>>6
// iterations; if n>>6 is zero the function returns without touching memory,
// and a trailing n%64 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 64-byte blocks
//   Z0-Z6                  matrix entries 0..6
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R10                    running pointer into out[0]
//   Z7 / Z8                accumulator / current input block and scratch
TEXT ·mulGFNI_7x1_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x1_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of out[0]'s slice header, then its data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R10
MOVQ start+72(FP), R11
// Add start offset to output
ADDQ R11, R10
// Add start offset to input
ADDQ R11, DX
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, CX
mulGFNI_7x1_64Xor_loop:
// Load 1 outputs
VMOVDQU64 (R10), Z7
// Load and process 64 bytes from input 0 to 1 outputs
VMOVDQU64 (DX), Z8
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z8
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z8
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z8
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z8
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (R9), Z8
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
VXORPD Z7, Z8, Z7
// Load and process 64 bytes from input 6 to 1 outputs
VMOVDQU64 (CX), Z8
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
VXORPD Z7, Z8, Z7
// Store 1 outputs
VMOVDQU64 Z7, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_7x1_64Xor_loop
// Clear upper vector state before returning to non-AVX code
VZEROUPPER
mulGFNI_7x1_64Xor_end:
RET
// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Folds 7 input slices into 1 output slice, 32 bytes per loop iteration.
// The 64-bit entry matrix[i] is broadcast over a YMM register and applied to
// the block from in[i] with VGF2P8AFFINEQB (imm8 = 0); each result is XORed
// into the block previously loaded from out[0], and the updated block is
// written back.
// Work starts at byte offset start of every slice and runs for n>>5
// iterations; if n>>5 is zero the function returns without touching memory,
// and a trailing n%32 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 32-byte blocks
//   Y0-Y6                  matrix entries 0..6
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R10                    running pointer into out[0]
//   Y7 / Y8                accumulator / current input block and scratch
TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 10 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x1Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of out[0]'s slice header, then its data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R10
MOVQ start+72(FP), R11
// Add start offset to output
ADDQ R11, R10
// Add start offset to input
ADDQ R11, DX
ADDQ R11, BX
ADDQ R11, SI
ADDQ R11, DI
ADDQ R11, R8
ADDQ R11, R9
ADDQ R11, CX
mulAvxGFNI_7x1Xor_loop:
// Load 1 outputs
VMOVDQU (R10), Y7
// Load and process 32 bytes from input 0 to 1 outputs
VMOVDQU (DX), Y8
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y8
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y8
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y8
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y8
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (R9), Y8
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
VXORPD Y7, Y8, Y7
// Load and process 32 bytes from input 6 to 1 outputs
VMOVDQU (CX), Y8
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
VXORPD Y7, Y8, Y7
// Store 1 outputs
VMOVDQU Y7, (R10)
ADDQ $0x20, R10
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_7x1Xor_loop
// Clear upper YMM state before returning to non-AVX code
VZEROUPPER
mulAvxGFNI_7x1Xor_end:
RET
// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 7 input slices into 2 output slices, 64 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*2+j] is
// broadcast over a ZMM register and applied to the block from in[i] with
// VGF2P8AFFINEQB (imm8 = 0); the 7 results per output are XORed together and
// stored to out[j], replacing its previous contents.
// Work starts at byte offset start of every slice and runs for n>>6
// iterations; if n>>6 is zero the function returns without touching memory,
// and a trailing n%64 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 64-byte blocks
//   Z0-Z13                 matrix entries 0..13
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R11, R10               running pointers into out[0], out[1]
//   Z14, Z15               accumulators for out[0], out[1]
//   Z16 / Z17              current input block / scratch
TEXT ·mulGFNI_7x2_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 18 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x2_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of the out slice headers; it ends up as out[1]'s data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R10
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R11
ADDQ R12, R10
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, CX
mulGFNI_7x2_64_loop:
// Load and process 64 bytes from input 0 to 2 outputs
VMOVDQU64 (DX), Z16
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z16, Z14
VGF2P8AFFINEQB $0x00, Z1, Z16, Z15
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z16
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (SI), Z16
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 3 to 2 outputs
VMOVDQU64 (DI), Z16
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 4 to 2 outputs
VMOVDQU64 (R8), Z16
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 5 to 2 outputs
VMOVDQU64 (R9), Z16
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 6 to 2 outputs
VMOVDQU64 (CX), Z16
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
VXORPD Z15, Z17, Z15
// Store 2 outputs
VMOVDQU64 Z14, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z15, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_7x2_64_loop
// Clear upper vector state before returning to non-AVX code
VZEROUPPER
mulGFNI_7x2_64_end:
RET
// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 7 input slices into 2 output slices, 32 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*2+j] is
// broadcast over a YMM register and applied to the block from in[i] with
// VGF2P8AFFINEQB (imm8 = 0); the 7 results per output are XORed together and
// stored to out[j], replacing its previous contents.
// Work starts at byte offset start of every slice and runs for n>>5
// iterations; if n>>5 is zero the function returns without touching memory,
// and a trailing n%32 bytes are not processed.
//
// Register roles inside the loop:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base (entries 12 and 13 are re-broadcast each iteration)
//   Y0-Y11                  matrix entries 0..11
//   BX,SI,DI,R8,R9,R10,DX   running pointers into in[0]..in[6]
//   R12, R11                running pointers into out[0], out[1]
//   Y12, Y13                accumulators for out[0], out[1]
//   Y14 / Y15               current input block / scratch
TEXT ·mulAvxGFNI_7x2(SB), $0-88
// Loading 12 of 14 tables to registers
// Destination kept in GP registers
// Full registers estimated 18 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x2_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
VBROADCASTSD 88(CX), Y11
// Fetch the data pointer of each input slice; DX is overwritten last
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = address of the out slice headers; it ends up as out[1]'s data pointer
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R11
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R12
ADDQ R13, R11
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, DX
mulAvxGFNI_7x2_loop:
// Load and process 32 bytes from input 0 to 2 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 2 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 2 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 2 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 2 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 2 outputs
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_7x2_loop
// Clear upper YMM state before returning to non-AVX code
VZEROUPPER
mulAvxGFNI_7x2_end:
RET
// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Folds 7 input slices into 2 output slices, 64 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*2+j] is
// broadcast over a ZMM register and applied to the block from in[i] with
// VGF2P8AFFINEQB (imm8 = 0); each result is XORed into the block previously
// loaded from out[j], and the updated block is written back.
// Work starts at byte offset start of every slice and runs for n>>6
// iterations; if n>>6 is zero the function returns without touching memory,
// and a trailing n%64 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 64-byte blocks
//   Z0-Z13                 matrix entries 0..13
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R11, R10               running pointers into out[0], out[1]
//   Z14, Z15               accumulators for out[0], out[1]
//   Z16 / Z17              current input block / scratch
TEXT ·mulGFNI_7x2_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 18 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x2_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of the out slice headers; it ends up as out[1]'s data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R10
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R11
ADDQ R12, R10
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, CX
mulGFNI_7x2_64Xor_loop:
// Load 2 outputs
VMOVDQU64 (R11), Z14
VMOVDQU64 (R10), Z15
// Load and process 64 bytes from input 0 to 2 outputs
VMOVDQU64 (DX), Z16
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z16
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (SI), Z16
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 3 to 2 outputs
VMOVDQU64 (DI), Z16
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 4 to 2 outputs
VMOVDQU64 (R8), Z16
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 5 to 2 outputs
VMOVDQU64 (R9), Z16
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
VXORPD Z15, Z17, Z15
// Load and process 64 bytes from input 6 to 2 outputs
VMOVDQU64 (CX), Z16
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
VXORPD Z14, Z17, Z14
VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
VXORPD Z15, Z17, Z15
// Store 2 outputs
VMOVDQU64 Z14, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z15, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_7x2_64Xor_loop
// Clear upper vector state before returning to non-AVX code
VZEROUPPER
mulGFNI_7x2_64Xor_end:
RET
// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Folds 7 input slices into 2 output slices, 32 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*2+j] is
// broadcast over a YMM register and applied to the block from in[i] with
// VGF2P8AFFINEQB (imm8 = 0); each result is XORed into the block previously
// loaded from out[j], and the updated block is written back.
// Work starts at byte offset start of every slice and runs for n>>5
// iterations; if n>>5 is zero the function returns without touching memory,
// and a trailing n%32 bytes are not processed.
//
// Register roles inside the loop:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base (entries 12 and 13 are re-broadcast each iteration)
//   Y0-Y11                  matrix entries 0..11
//   BX,SI,DI,R8,R9,R10,DX   running pointers into in[0]..in[6]
//   R12, R11                running pointers into out[0], out[1]
//   Y12, Y13                accumulators for out[0], out[1]
//   Y14 / Y15               current input block / scratch
TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88
// Loading 12 of 14 tables to registers
// Destination kept in GP registers
// Full registers estimated 18 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x2Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
VBROADCASTSD 88(CX), Y11
// Fetch the data pointer of each input slice; DX is overwritten last
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = address of the out slice headers; it ends up as out[1]'s data pointer
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R11
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R12
ADDQ R13, R11
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, DX
mulAvxGFNI_7x2Xor_loop:
// Load 2 outputs
VMOVDQU (R12), Y12
VMOVDQU (R11), Y13
// Load and process 32 bytes from input 0 to 2 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 2 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 2 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 2 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 2 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 2 outputs
VMOVDQU Y12, (R12)
ADDQ $0x20, R12
VMOVDQU Y13, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_7x2Xor_loop
// Clear upper YMM state before returning to non-AVX code
VZEROUPPER
mulAvxGFNI_7x2Xor_end:
RET
// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 7 input slices into 3 output slices, 64 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*3+j] is
// broadcast over a ZMM register and applied to the block from in[i] with
// VGF2P8AFFINEQB (imm8 = 0); the 7 results per output are XORed together and
// stored to out[j], replacing its previous contents.
// Work starts at byte offset start of every slice and runs for n>>6
// iterations; if n>>6 is zero the function returns without touching memory,
// and a trailing n%64 bytes are not processed.
//
// Register roles inside the loop:
//   AX                     remaining 64-byte blocks
//   Z0-Z20                 matrix entries 0..20
//   DX,BX,SI,DI,R8,R9,CX   running pointers into in[0]..in[6]
//   R11, R12, R10          running pointers into out[0], out[1], out[2]
//   Z21, Z22, Z23          accumulators for out[0], out[1], out[2]
//   Z24 / Z25              current input block / scratch
TEXT ·mulGFNI_7x3_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x3_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
// The matrix is fully in registers, so CX is reused for the input headers
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), CX
// R10 = address of the out slice headers; it ends up as out[2]'s data pointer
MOVQ out_base+48(FP), R10
MOVQ (R10), R11
MOVQ 24(R10), R12
MOVQ 48(R10), R10
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R11
ADDQ R13, R12
ADDQ R13, R10
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, CX
mulGFNI_7x3_64_loop:
// Load and process 64 bytes from input 0 to 3 outputs
VMOVDQU64 (DX), Z24
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z24, Z21
VGF2P8AFFINEQB $0x00, Z1, Z24, Z22
VGF2P8AFFINEQB $0x00, Z2, Z24, Z23
// Load and process 64 bytes from input 1 to 3 outputs
VMOVDQU64 (BX), Z24
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 2 to 3 outputs
VMOVDQU64 (SI), Z24
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 3 to 3 outputs
VMOVDQU64 (DI), Z24
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 4 to 3 outputs
VMOVDQU64 (R8), Z24
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 5 to 3 outputs
VMOVDQU64 (R9), Z24
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
VXORPD Z23, Z25, Z23
// Load and process 64 bytes from input 6 to 3 outputs
VMOVDQU64 (CX), Z24
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
VXORPD Z21, Z25, Z21
VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
VXORPD Z22, Z25, Z22
VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
VXORPD Z23, Z25, Z23
// Store 3 outputs
VMOVDQU64 Z21, (R11)
ADDQ $0x40, R11
VMOVDQU64 Z22, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z23, (R10)
ADDQ $0x40, R10
// Prepare for next loop
DECQ AX
JNZ mulGFNI_7x3_64_loop
// Clear upper vector state before returning to non-AVX code
VZEROUPPER
mulGFNI_7x3_64_end:
RET
// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 7 input slices into 3 output slices, 32 bytes per loop iteration.
// For every (input i, output j) pair the 64-bit entry matrix[i*3+j] is
// broadcast over a YMM register and applied to the block from in[i] with
// VGF2P8AFFINEQB (imm8 = 0); the 7 results per output are XORed together and
// stored to out[j], replacing its previous contents.
// Work starts at byte offset start of every slice and runs for n>>5
// iterations; if n>>5 is zero the function returns without touching memory,
// and a trailing n%32 bytes are not processed.
//
// Register roles inside the loop:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base (entries 11..20 are re-broadcast each iteration)
//   Y0-Y10                  matrix entries 0..10
//   BX,SI,DI,R8,R9,R10,DX   running pointers into in[0]..in[6]
//   R12, R13, R11           running pointers into out[0], out[1], out[2]
//   Y11, Y12, Y13           accumulators for out[0], out[1], out[2]
//   Y14 / Y15               current input block / scratch
TEXT ·mulAvxGFNI_7x3(SB), $0-88
// Loading 11 of 21 tables to registers
// Destination kept in GP registers
// Full registers estimated 26 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x3_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
// Fetch the data pointer of each input slice; DX is overwritten last
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = address of the out slice headers; it ends up as out[2]'s data pointer
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R13
MOVQ 48(R11), R11
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R12
ADDQ R14, R13
ADDQ R14, R11
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, DX
mulAvxGFNI_7x3_loop:
// Load and process 32 bytes from input 0 to 3 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
// Load and process 32 bytes from input 1 to 3 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 3 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 3 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 3 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 3 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 3 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 3 outputs
VMOVDQU Y11, (R12)
ADDQ $0x20, R12
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_7x3_loop
// Clear upper YMM state before returning to non-AVX code
VZEROUPPER
mulAvxGFNI_7x3_end:
RET
// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating variant: for n>>6 iterations of 64 bytes, starting at byte
// offset start in every slice, each of the 3 output blocks is loaded, XORed
// with VGF2P8AFFINEQB (immediate 0) of every one of the 7 input blocks using
// the 64-bit entry matrix[input*3+output], and stored back.
// Bytes past the last whole 64-byte block (n & 63) are neither read nor written.
//
// Register roles:
//   AX                         remaining 64-byte blocks
//   CX                         matrix base, then in header pointer, then input 6
//   DX, BX, SI, DI, R8, R9, CX   input pointers 0..6
//   R11, R12, R10              output pointers 0..2
//   Z0-Z20                     all 21 matrix entries, broadcast once
//   Z21-Z23                    output accumulators 0..2
//   Z24 / Z25                  current input block / scratch
TEXT ·mulGFNI_7x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_7x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	// The matrix is fully register-resident now, so CX is free to be reused.
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	// Same layout for out; R10 is reused for the last output pointer.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, CX

mulGFNI_7x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R11), Z21
	VMOVDQU64 (R12), Z22
	VMOVDQU64 (R10), Z23

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z24
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z24
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 3 outputs
	VMOVDQU64 Z21, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x3_64Xor_loop
	VZEROUPPER

mulGFNI_7x3_64Xor_end:
	RET
// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating variant: for n>>5 iterations of 32 bytes, starting at byte
// offset start in every slice, each of the 3 output blocks is loaded, XORed
// with VGF2P8AFFINEQB (immediate 0) of every one of the 7 input blocks using
// the 64-bit entry matrix[input*3+output], and stored back.
// Bytes past the last whole 32-byte block (n & 31) are neither read nor written.
//
// Register roles:
//   AX                         remaining 32-byte blocks
//   CX                         matrix base (stays live for non-resident entries)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R11              output pointers 0..2
//   Y0-Y10                     matrix entries 0..10, broadcast once
//   Y11-Y13                    output accumulators 0..2
//   Y14 / Y15                  current input block / scratch
TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88
	// Loading 11 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX

mulAvxGFNI_7x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	// Entries 11..20 are not register-resident: broadcast each from memory
	// into the scratch register right before use.
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x3Xor_loop
	VZEROUPPER

mulAvxGFNI_7x3Xor_end:
	RET
// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Produces 4 output slices from 7 input slices, 64 bytes per iteration, for
// n>>6 iterations starting at byte offset start in every slice. Each output
// block is the XOR, over all 7 inputs, of VGF2P8AFFINEQB (immediate 0) applied
// to the input block with the 64-bit entry matrix[input*4+output].
// Previous output contents are overwritten. Bytes past the last whole
// 64-byte block (n & 63) are neither read nor written.
//
// Register roles:
//   AX                         remaining 64-byte blocks
//   CX                         matrix base (stays live for entries 26 and 27)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R14, R11         output pointers 0..3
//   Z0-Z25                     matrix entries 0..25, broadcast once
//   Z26-Z29                    output accumulators 0..3
//   Z30 / Z31                  current input block / scratch
TEXT ·mulGFNI_7x4_64(SB), $0-88
	// Loading 26 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_7x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R11
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulGFNI_7x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	// Entries 26 and 27 are not register-resident: use the embedded-broadcast
	// memory operand form instead.
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 4 outputs
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x4_64_loop
	VZEROUPPER

mulGFNI_7x4_64_end:
	RET
// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Produces 4 output slices from 7 input slices, 32 bytes per iteration, for
// n>>5 iterations starting at byte offset start in every slice. Each output
// block is the XOR, over all 7 inputs, of VGF2P8AFFINEQB (immediate 0) applied
// to the input block with the 64-bit entry matrix[input*4+output].
// Previous output contents are overwritten. Bytes past the last whole
// 32-byte block (n & 31) are neither read nor written.
//
// Register roles:
//   AX                         remaining 32-byte blocks
//   CX                         matrix base (stays live for non-resident entries)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R14, R11         output pointers 0..3
//   Y0-Y9                      matrix entries 0..9, broadcast once
//   Y10-Y13                    output accumulators 0..3
//   Y14 / Y15                  current input block / scratch
TEXT ·mulAvxGFNI_7x4(SB), $0-88
	// Loading 10 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R11
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulAvxGFNI_7x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	// Entries 10..27 are not register-resident: broadcast each from memory
	// into the scratch register right before use.
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x4_loop
	VZEROUPPER

mulAvxGFNI_7x4_end:
	RET
// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating variant: for n>>6 iterations of 64 bytes, starting at byte
// offset start in every slice, each of the 4 output blocks is loaded, XORed
// with VGF2P8AFFINEQB (immediate 0) of every one of the 7 input blocks using
// the 64-bit entry matrix[input*4+output], and stored back.
// Bytes past the last whole 64-byte block (n & 63) are neither read nor written.
//
// Register roles:
//   AX                         remaining 64-byte blocks
//   CX                         matrix base (stays live for entries 26 and 27)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R14, R11         output pointers 0..3
//   Z0-Z25                     matrix entries 0..25, broadcast once
//   Z26-Z29                    output accumulators 0..3
//   Z30 / Z31                  current input block / scratch
TEXT ·mulGFNI_7x4_64Xor(SB), $0-88
	// Loading 26 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_7x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R11
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulGFNI_7x4_64Xor_loop:
	// Load 4 outputs
	VMOVDQU64 (R12), Z26
	VMOVDQU64 (R13), Z27
	VMOVDQU64 (R14), Z28
	VMOVDQU64 (R11), Z29

	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	// Entries 26 and 27 are not register-resident: use the embedded-broadcast
	// memory operand form instead.
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 4 outputs
	VMOVDQU64 Z26, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x4_64Xor_loop
	VZEROUPPER

mulGFNI_7x4_64Xor_end:
	RET
// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating variant: for n>>5 iterations of 32 bytes, starting at byte
// offset start in every slice, each of the 4 output blocks is loaded, XORed
// with VGF2P8AFFINEQB (immediate 0) of every one of the 7 input blocks using
// the 64-bit entry matrix[input*4+output], and stored back.
// Bytes past the last whole 32-byte block (n & 31) are neither read nor written.
//
// Register roles:
//   AX                         remaining 32-byte blocks
//   CX                         matrix base (stays live for non-resident entries)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R14, R11         output pointers 0..3
//   Y0-Y9                      matrix entries 0..9, broadcast once
//   Y10-Y13                    output accumulators 0..3
//   Y14 / Y15                  current input block / scratch
TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88
	// Loading 10 of 28 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R11
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX

mulAvxGFNI_7x4Xor_loop:
	// Load 4 outputs
	VMOVDQU (R12), Y10
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	// Entries 10..27 are not register-resident: broadcast each from memory
	// into the scratch register right before use.
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 4 outputs
	VMOVDQU Y10, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x4Xor_loop
	VZEROUPPER

mulAvxGFNI_7x4Xor_end:
	RET
// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Produces 5 output slices from 7 input slices, 64 bytes per iteration, for
// n>>6 iterations starting at byte offset start in every slice. Each output
// block is the XOR, over all 7 inputs, of VGF2P8AFFINEQB (immediate 0) applied
// to the input block with the 64-bit entry matrix[input*5+output].
// Previous output contents are overwritten. Bytes past the last whole
// 64-byte block (n & 63) are neither read nor written.
//
// Register roles:
//   AX                         remaining 64-byte blocks
//   CX                         matrix base (stays live for entries 25..34)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R14, R15, R11    output pointers 0..4
//   BP                         start offset (setup only)
//   Z0-Z24                     matrix entries 0..24, broadcast once
//   Z25-Z29                    output accumulators 0..4
//   Z30 / Z31                  current input block / scratch
//
// NOTE(review): BP is used as a general-purpose register here; the 8-byte
// frame is presumably what makes the assembler save and restore it — confirm
// against the Go assembler documentation before changing the frame size.
TEXT ·mulGFNI_7x5_64(SB), $8-88
	// Loading 25 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_7x5_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX

mulGFNI_7x5_64_loop:
	// Load and process 64 bytes from input 0 to 5 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	// Entries 25..34 are not register-resident: use the embedded-broadcast
	// memory operand form instead.
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x5_64_loop
	VZEROUPPER

mulGFNI_7x5_64_end:
	RET
// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Produces 5 output slices from 7 input slices, 32 bytes per iteration, for
// n>>5 iterations starting at byte offset start in every slice. Each output
// block is the XOR, over all 7 inputs, of VGF2P8AFFINEQB (immediate 0) applied
// to the input block with the 64-bit entry matrix[input*5+output].
// Previous output contents are overwritten. Bytes past the last whole
// 32-byte block (n & 31) are neither read nor written.
//
// Register roles:
//   AX                         remaining 32-byte blocks
//   CX                         matrix base (stays live for non-resident entries)
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6
//   R12, R13, R14, R15, R11    output pointers 0..4
//   BP                         start offset (setup only)
//   Y0-Y8                      matrix entries 0..8, broadcast once
//   Y9-Y13                     output accumulators 0..4
//   Y14 / Y15                  current input block / scratch
//
// NOTE(review): BP is used as a general-purpose register here; the 8-byte
// frame is presumably what makes the assembler save and restore it — confirm
// against the Go assembler documentation before changing the frame size.
TEXT ·mulAvxGFNI_7x5(SB), $8-88
	// Loading 9 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x5_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	// in is a slice of slice headers (24 bytes each); fetch the 7 data pointers.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Same layout for out; R11 is reused for the last output pointer.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX

mulAvxGFNI_7x5_loop:
	// Load and process 32 bytes from input 0 to 5 outputs
	// (first input writes the accumulators directly, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	// Entries 9..34 are not register-resident: broadcast each from memory
	// into the scratch register right before use.
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 5 outputs
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x5_loop
	VZEROUPPER

mulAvxGFNI_7x5_end:
	RET
// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x5_64Xor applies a 7x5 grid of 8-byte GF2P8AFFINEQB matrices to 7
// input buffers and XORs the products into the current contents of 5 output
// buffers. Each loop iteration handles 64 bytes per buffer and the loop runs
// n>>6 times; any remaining n&63 bytes are not touched. Nothing is read or
// written when n>>6 is zero.
//
// The matrix for (input i, output j) is read from byte offset (i*5+j)*8 of
// matrix. in and out are walked as arrays of 24-byte slice headers, taking the
// first word (data pointer) of each; start is added to every pointer.
//
// Register roles:
//   AX                          remaining 64-byte iterations
//   CX                          matrix base
//   BX, SI, DI, R8, R9, R10, DX input 0..6 pointers
//   R12, R13, R14, R15, R11     output 0..4 pointers
//   BP                          start offset (setup only)
//   Z0-Z24                      matrices for inputs 0..4
//   Z25-Z29                     output accumulators
//   Z30 / Z31                   current input block / product scratch
// Matrices for inputs 5 and 6 are read from memory with embedded broadcast.
TEXT ·mulGFNI_7x5_64Xor(SB), $8-88
	// Loading 25 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ by a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x5_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it doubles as the base of the slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Fetch the data pointer of each output slice; R11 is overwritten last
	// for the same reason.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX
mulGFNI_7x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R11), Z29
	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 5 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 5 outputs
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x5_64Xor_loop
	VZEROUPPER
mulGFNI_7x5_64Xor_end:
	RET
// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x5Xor applies a 7x5 grid of 8-byte GF2P8AFFINEQB matrices to 7
// input buffers and XORs the products into the current contents of 5 output
// buffers. Each loop iteration handles 32 bytes per buffer and the loop runs
// n>>5 times; any remaining n&31 bytes are not touched. Nothing is read or
// written when n>>5 is zero.
//
// The matrix for (input i, output j) is read from byte offset (i*5+j)*8 of
// matrix. in and out are walked as arrays of 24-byte slice headers, taking the
// first word (data pointer) of each; start is added to every pointer.
//
// Register roles:
//   AX                          remaining 32-byte iterations
//   CX                          matrix base
//   BX, SI, DI, R8, R9, R10, DX input 0..6 pointers
//   R12, R13, R14, R15, R11     output 0..4 pointers
//   BP                          start offset (setup only)
//   Y0-Y8                       matrices 0..8 (input 0 and first four of input 1)
//   Y9-Y13                      output accumulators
//   Y14                         current input block
//   Y15                         product scratch, also the broadcast target for
//                               every matrix that is not resident in Y0-Y8
TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88
	// Loading 9 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 42 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ by a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x5Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it doubles as the base of the slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	// Fetch the data pointer of each output slice; R11 is overwritten last
	// for the same reason.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, DX
mulAvxGFNI_7x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R11), Y13
	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 5 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 5 outputs
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x5Xor_loop
	VZEROUPPER
mulAvxGFNI_7x5Xor_end:
	RET
// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x6_64 applies a 7x6 grid of 8-byte GF2P8AFFINEQB matrices to 7
// input buffers and stores the XOR of the 7 products for each of the 6 output
// buffers, overwriting whatever the outputs held. Each loop iteration handles
// 64 bytes per buffer and the loop runs n>>6 times; any remaining n&63 bytes
// are not touched. Nothing is read or written when n>>6 is zero.
//
// The matrix for (input i, output j) is read from byte offset (i*6+j)*8 of
// matrix. in and out are walked as arrays of 24-byte slice headers, taking the
// first word (data pointer) of each; start is added to every pointer.
//
// Register roles:
//   CX                           matrix base
//   DX, BX, SI, DI, R8, R9, AX   input 0..6 pointers
//   R11, R12, R13, R14, R15, R10 output 0..5 pointers
//   BP                           start offset during setup, then the
//                                remaining 64-byte iteration count
//   Z0-Z23                       matrices for inputs 0..3
//   Z24-Z29                      output accumulators
//   Z30 / Z31                    current input block / product scratch
// AX carries n>>6 only for the early-exit check; it is then reused as a
// pointer, which is why the count is recomputed into BP before the loop.
// Matrices for inputs 4..6 are read from memory with embedded broadcast.
TEXT ·mulGFNI_7x6_64(SB), $8-88
	// Loading 24 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ by a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x6_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	// Fetch the data pointer of each input slice; AX is overwritten last
	// because it doubles as the base of the slice-header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX
	// Fetch the data pointer of each output slice; R10 is overwritten last
	// for the same reason.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP
mulGFNI_7x6_64_loop:
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 6 outputs
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10
	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_7x6_64_loop
	VZEROUPPER
mulGFNI_7x6_64_end:
	RET
// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x6 applies a 7x6 grid of 8-byte GF2P8AFFINEQB matrices to 7
// input buffers and stores the XOR of the 7 products for each of the 6 output
// buffers, overwriting whatever the outputs held. Each loop iteration handles
// 32 bytes per buffer and the loop runs n>>5 times; any remaining n&31 bytes
// are not touched. Nothing is read or written when n>>5 is zero.
//
// The matrix for (input i, output j) is read from byte offset (i*6+j)*8 of
// matrix. in and out are walked as arrays of 24-byte slice headers, taking the
// first word (data pointer) of each; start is added to every pointer.
//
// Register roles:
//   CX                           matrix base
//   DX, BX, SI, DI, R8, R9, AX   input 0..6 pointers
//   R11, R12, R13, R14, R15, R10 output 0..5 pointers
//   BP                           start offset during setup, then the
//                                remaining 32-byte iteration count
//   Y0-Y7                        matrices 0..7 (input 0 and first two of input 1)
//   Y8-Y13                       output accumulators
//   Y14                          current input block
//   Y15                          product scratch, also the broadcast target for
//                                every matrix that is not resident in Y0-Y7
// AX carries n>>5 only for the early-exit check; it is then reused as a
// pointer, which is why the count is recomputed into BP before the loop.
TEXT ·mulAvxGFNI_7x6(SB), $8-88
	// Loading 8 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ by a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x6_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	// Fetch the data pointer of each input slice; AX is overwritten last
	// because it doubles as the base of the slice-header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX
	// Fetch the data pointer of each output slice; R10 is overwritten last
	// for the same reason.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP
mulAvxGFNI_7x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 6 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10
	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_7x6_loop
	VZEROUPPER
mulAvxGFNI_7x6_end:
	RET
// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x6_64Xor applies a 7x6 grid of 8-byte GF2P8AFFINEQB matrices to 7
// input buffers and XORs the products into the current contents of 6 output
// buffers. Each loop iteration handles 64 bytes per buffer and the loop runs
// n>>6 times; any remaining n&63 bytes are not touched. Nothing is read or
// written when n>>6 is zero.
//
// The matrix for (input i, output j) is read from byte offset (i*6+j)*8 of
// matrix. in and out are walked as arrays of 24-byte slice headers, taking the
// first word (data pointer) of each; start is added to every pointer.
//
// Register roles:
//   CX                           matrix base
//   DX, BX, SI, DI, R8, R9, AX   input 0..6 pointers
//   R11, R12, R13, R14, R15, R10 output 0..5 pointers
//   BP                           start offset during setup, then the
//                                remaining 64-byte iteration count
//   Z0-Z23                       matrices for inputs 0..3
//   Z24-Z29                      output accumulators
//   Z30 / Z31                    current input block / product scratch
// AX carries n>>6 only for the early-exit check; it is then reused as a
// pointer, which is why the count is recomputed into BP before the loop.
// Matrices for inputs 4..6 are read from memory with embedded broadcast.
TEXT ·mulGFNI_7x6_64Xor(SB), $8-88
	// Loading 24 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ by a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x6_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	// Fetch the data pointer of each input slice; AX is overwritten last
	// because it doubles as the base of the slice-header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX
	// Fetch the data pointer of each output slice; R10 is overwritten last
	// for the same reason.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP
mulGFNI_7x6_64Xor_loop:
	// Load 6 outputs
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R10), Z29
	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 6 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 6 outputs
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R10)
	ADDQ $0x40, R10
	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_7x6_64Xor_loop
	VZEROUPPER
mulGFNI_7x6_64Xor_end:
	RET
// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x6Xor applies a 7x6 grid of 8-byte GF2P8AFFINEQB matrices to 7
// input buffers and XORs the products into the current contents of 6 output
// buffers. Each loop iteration handles 32 bytes per buffer and the loop runs
// n>>5 times; any remaining n&31 bytes are not touched. Nothing is read or
// written when n>>5 is zero.
//
// The matrix for (input i, output j) is read from byte offset (i*6+j)*8 of
// matrix. in and out are walked as arrays of 24-byte slice headers, taking the
// first word (data pointer) of each; start is added to every pointer.
//
// Register roles:
//   CX                           matrix base
//   DX, BX, SI, DI, R8, R9, AX   input 0..6 pointers
//   R11, R12, R13, R14, R15, R10 output 0..5 pointers
//   BP                           start offset during setup, then the
//                                remaining 32-byte iteration count
//   Y0-Y7                        matrices 0..7 (input 0 and first two of input 1)
//   Y8-Y13                       output accumulators
//   Y14                          current input block
//   Y15                          product scratch, also the broadcast target for
//                                every matrix that is not resident in Y0-Y7
// AX carries n>>5 only for the early-exit check; it is then reused as a
// pointer, which is why the count is recomputed into BP before the loop.
TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88
	// Loading 8 of 42 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ by a non-zero count sets ZF from its result, so JZ can follow directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x6Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	// Fetch the data pointer of each input slice; AX is overwritten last
	// because it doubles as the base of the slice-header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), AX
	// Fetch the data pointer of each output slice; R10 is overwritten last
	// for the same reason.
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R13
	MOVQ 72(R10), R14
	MOVQ 96(R10), R15
	MOVQ 120(R10), R10
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R10
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP
mulAvxGFNI_7x6Xor_loop:
	// Load 6 outputs
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R10), Y13
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 6 outputs
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R10)
	ADDQ $0x20, R10
	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_7x6Xor_loop
	VZEROUPPER
mulAvxGFNI_7x6Xor_end:
	RET
// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of 7 outputs, XORs together the VGF2P8AFFINEQB transform of all
// 7 inputs and overwrites the output with the result, 64 bytes per iteration.
//
//   matrix  49 uint64 entries are read; entry (input*7 + output) is the 8x8
//           bit matrix applied to that input when accumulating that output
//           (presumably the GF(2^8) multiplication table of one coding
//           coefficient - confirm against the Go caller).
//   in      7 slice headers; only the data pointers are read, lengths are
//           not checked here.
//   out     7 slice headers; the data pointers are re-read from the headers
//           on every iteration instead of being cached in registers.
//   start   byte offset applied to every input and every output.
//   n       byte count; n>>6 blocks of 64 bytes are processed and the
//           remaining n%64 bytes are not touched. Returns at once when
//           n>>6 is zero.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6 (advanced every block)
//   R11                          base of the out slice-header array
//   R12                          byte offset into every output (starts at start)
//   R13                          scratch: current output data pointer
//   Z0-Z22                       matrix entries 0..22, broadcast to all lanes
//   Z23-Z29                      accumulators for outputs 0..6
//   Z30                          current input block
//   Z31                          scratch product
TEXT ·mulGFNI_7x7_64(SB), $0-88
	// Loading 23 of 49 tables to registers; the other 26 are used as
	// broadcast memory operands inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x7_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	// Input data pointers: slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (cached tables run out after Z22; the rest are broadcast from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	// (each data pointer is fetched from its slice header, then indexed by R12)
	MOVQ (R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x7_64_loop
	VZEROUPPER

mulGFNI_7x7_64_end:
	RET
// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of 7 outputs, XORs together the VGF2P8AFFINEQB transform of all
// 7 inputs and overwrites the output with the result, 32 bytes per iteration.
//
//   matrix  49 uint64 entries are read; entry (input*7 + output) is the 8x8
//           bit matrix applied to that input when accumulating that output
//           (presumably the GF(2^8) multiplication table of one coding
//           coefficient - confirm against the Go caller).
//   in      7 slice headers; only the data pointers are read, lengths are
//           not checked here.
//   out     7 slice headers; the data pointers are re-read from the headers
//           on every iteration instead of being cached in registers.
//   start   byte offset applied to every input and every output.
//   n       byte count; n>>5 blocks of 32 bytes are processed and the
//           remaining n%32 bytes are not touched. Returns at once when
//           n>>5 is zero.
//
// Register use:
//   AX                           remaining 32-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6 (advanced every block)
//   R11                          base of the out slice-header array
//   R12                          byte offset into every output (starts at start)
//   R13                          scratch: current output data pointer
//   Y0-Y6                        matrix entries 0..6, broadcast to all lanes
//   Y7-Y13                       accumulators for outputs 0..6
//   Y14                          current input block
//   Y15                          scratch: broadcast table, then its product
TEXT ·mulAvxGFNI_7x7(SB), $0-88
	// Loading 7 of 49 tables to registers; the other 42 are broadcast
	// into Y15 right before use inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x7_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	// Input data pointers: slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x7_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	// (each data pointer is fetched from its slice header, then indexed by R12)
	MOVQ (R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x7_loop
	VZEROUPPER

mulAvxGFNI_7x7_end:
	RET
// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of 7 outputs, XORs the VGF2P8AFFINEQB transform of all 7 inputs
// into the bytes already present in the output, 64 bytes per iteration.
//
//   matrix  49 uint64 entries are read; entry (input*7 + output) is the 8x8
//           bit matrix applied to that input when accumulating that output
//           (presumably the GF(2^8) multiplication table of one coding
//           coefficient - confirm against the Go caller).
//   in      7 slice headers; only the data pointers are read, lengths are
//           not checked here.
//   out     7 slice headers; the data pointers are re-read from the headers
//           for every load and store instead of being cached in registers.
//   start   byte offset applied to every input and every output.
//   n       byte count; n>>6 blocks of 64 bytes are processed and the
//           remaining n%64 bytes are not touched. Returns at once when
//           n>>6 is zero.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6 (advanced every block)
//   R11                          base of the out slice-header array
//   R12                          byte offset into every output (starts at start)
//   R13                          scratch: current output data pointer
//   Z0-Z22                       matrix entries 0..22, broadcast to all lanes
//   Z23-Z29                      accumulators for outputs 0..6
//   Z30                          current input block
//   Z31                          scratch product
TEXT ·mulGFNI_7x7_64Xor(SB), $0-88
	// Loading 23 of 49 tables to registers; the other 26 are used as
	// broadcast memory operands inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x7_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	// Input data pointers: slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x7_64Xor_loop:
	// Load 7 outputs
	// (existing output bytes seed the accumulators)
	MOVQ (R11), R13
	VMOVDQU64 (R13)(R12*1), Z23
	MOVQ 24(R11), R13
	VMOVDQU64 (R13)(R12*1), Z24
	MOVQ 48(R11), R13
	VMOVDQU64 (R13)(R12*1), Z25
	MOVQ 72(R11), R13
	VMOVDQU64 (R13)(R12*1), Z26
	MOVQ 96(R11), R13
	VMOVDQU64 (R13)(R12*1), Z27
	MOVQ 120(R11), R13
	VMOVDQU64 (R13)(R12*1), Z28
	MOVQ 144(R11), R13
	VMOVDQU64 (R13)(R12*1), Z29

	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	// (cached tables run out after Z22; the rest are broadcast from memory)
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 7 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 7 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 7 outputs
	// (each data pointer is fetched from its slice header, then indexed by R12)
	MOVQ (R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x7_64Xor_loop
	VZEROUPPER

mulGFNI_7x7_64Xor_end:
	RET
// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each of 7 outputs, XORs the VGF2P8AFFINEQB transform of all 7 inputs
// into the bytes already present in the output, 32 bytes per iteration.
//
//   matrix  49 uint64 entries are read; entry (input*7 + output) is the 8x8
//           bit matrix applied to that input when accumulating that output
//           (presumably the GF(2^8) multiplication table of one coding
//           coefficient - confirm against the Go caller).
//   in      7 slice headers; only the data pointers are read, lengths are
//           not checked here.
//   out     7 slice headers; the data pointers are re-read from the headers
//           for every load and store instead of being cached in registers.
//   start   byte offset applied to every input and every output.
//   n       byte count; n>>5 blocks of 32 bytes are processed and the
//           remaining n%32 bytes are not touched. Returns at once when
//           n>>5 is zero.
//
// Register use:
//   AX                           remaining 32-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6 (advanced every block)
//   R11                          base of the out slice-header array
//   R12                          byte offset into every output (starts at start)
//   R13                          scratch: current output data pointer
//   Y0-Y6                        matrix entries 0..6, broadcast to all lanes
//   Y7-Y13                       accumulators for outputs 0..6
//   Y14                          current input block
//   Y15                          scratch: broadcast table, then its product
TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88
	// Loading 7 of 49 tables to registers; the other 42 are broadcast
	// into Y15 right before use inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	// Input data pointers: slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x7Xor_loop:
	// Load 7 outputs
	// (existing output bytes seed the accumulators)
	MOVQ (R11), R13
	VMOVDQU (R13)(R12*1), Y7
	MOVQ 24(R11), R13
	VMOVDQU (R13)(R12*1), Y8
	MOVQ 48(R11), R13
	VMOVDQU (R13)(R12*1), Y9
	MOVQ 72(R11), R13
	VMOVDQU (R13)(R12*1), Y10
	MOVQ 96(R11), R13
	VMOVDQU (R13)(R12*1), Y11
	MOVQ 120(R11), R13
	VMOVDQU (R13)(R12*1), Y12
	MOVQ 144(R11), R13
	VMOVDQU (R13)(R12*1), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	// (each data pointer is fetched from its slice header, then indexed by R12)
	MOVQ (R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x7Xor_loop
	VZEROUPPER

mulAvxGFNI_7x7Xor_end:
	RET
// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each of 8 outputs, XORs together the VGF2P8AFFINEQB transform of all
// 7 inputs and overwrites the output with the result, 64 bytes per iteration.
//
//   matrix  56 uint64 entries are read; entry (input*8 + output) is the 8x8
//           bit matrix applied to that input when accumulating that output
//           (presumably the GF(2^8) multiplication table of one coding
//           coefficient - confirm against the Go caller).
//   in      7 slice headers; only the data pointers are read, lengths are
//           not checked here.
//   out     8 slice headers; the data pointers are re-read from the headers
//           on every iteration instead of being cached in registers.
//   start   byte offset applied to every input and every output.
//   n       byte count; n>>6 blocks of 64 bytes are processed and the
//           remaining n%64 bytes are not touched. Returns at once when
//           n>>6 is zero.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  input pointers 0..6 (advanced every block)
//   R11                          base of the out slice-header array
//   R12                          byte offset into every output (starts at start)
//   R13                          scratch: current output data pointer
//   Z0-Z21                       matrix entries 0..21, broadcast to all lanes
//   Z22-Z29                      accumulators for outputs 0..7
//   Z30                          current input block
//   Z31                          scratch product
TEXT ·mulGFNI_7x8_64(SB), $0-88
	// Loading 22 of 56 tables to registers; the other 34 are used as
	// broadcast memory operands inside the loop.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_7x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	// Input data pointers: slice headers are 24 bytes apart.
	// DX is overwritten last because it holds the header array base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (first input initialises the accumulators, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (cached tables run out after Z21; the rest are broadcast from memory)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	// (each data pointer is fetched from its slice header, then indexed by R12)
	MOVQ (R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x8_64_loop
	VZEROUPPER

mulGFNI_7x8_64_end:
	RET
// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x8(SB), $0-88
// Loading 6 of 56 tables to registers
// Destination kept on stack
// Full registers estimated 66 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x8_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
MOVQ out_base+48(FP), R11
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulAvxGFNI_7x8_loop:
// Load and process 32 bytes from input 0 to 8 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
VBROADCASTSD 48(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 56(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 8 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 8 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
MOVQ (R11), R13
VMOVDQU Y6, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU Y7, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU Y8, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU Y9, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU Y10, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU Y11, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU Y12, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU Y13, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x20, R12
DECQ AX
JNZ mulAvxGFNI_7x8_loop
VZEROUPPER
mulAvxGFNI_7x8_end:
RET
// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x8_64Xor combines 7 inputs into 8 outputs, 64 bytes per iteration,
// and XORs the result into the bytes already stored in each output.
// For input i and output j the current 64-byte block of in[i] is passed through
// VGF2P8AFFINEQB (immediate 0) with the 8-byte word matrix[i*8+j]; the seven
// results for output j are XORed together with the previous contents of out[j].
// Presumably every matrix word encodes multiplication by one GF(2^8)
// coefficient - confirm against the Go caller that builds the matrix.
//
// Only n>>6 whole blocks are processed. When that count is zero the function
// returns immediately; the n&63 trailing bytes are never touched.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base; words 0-21 are pre-broadcast into Z0-Z21, words
//            22-55 are read with an embedded broadcast inside the loop
//   BX, SI, DI, R8, R9, R10, DX
//            read cursors for in[0]..in[6], already advanced by start
//   R11      base of the out slice headers (24 bytes apart)
//   R12      byte offset applied to every output, initialised to start
//   R13      scratch: data pointer of the output being loaded or stored
//   Z22-Z29  accumulators for out[0]..out[7]
//   Z30      current input block
//   Z31      per-product temporary
//
// NOTE(review): this file is generated by gen.go. Two dead instructions from
// the generator output were dropped here (a TESTQ repeating the ZF produced by
// SHRQ, and a second identical load of out_base into R11); the generator
// should be fixed as well or regeneration will bring them back.
TEXT ·mulGFNI_7x8_64Xor(SB), $0-88
	// Loading 22 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 66 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks; SHRQ sets ZF when that is zero.
	SHRQ $0x06, AX
	JZ mulGFNI_7x8_64Xor_end

	// Pre-broadcast matrix words 0-21.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x8_64Xor_loop:
	// Load 8 outputs
	MOVQ (R11), R13
	VMOVDQU64 (R13)(R12*1), Z22
	MOVQ 24(R11), R13
	VMOVDQU64 (R13)(R12*1), Z23
	MOVQ 48(R11), R13
	VMOVDQU64 (R13)(R12*1), Z24
	MOVQ 72(R11), R13
	VMOVDQU64 (R13)(R12*1), Z25
	MOVQ 96(R11), R13
	VMOVDQU64 (R13)(R12*1), Z26
	MOVQ 120(R11), R13
	VMOVDQU64 (R13)(R12*1), Z27
	MOVQ 144(R11), R13
	VMOVDQU64 (R13)(R12*1), Z28
	MOVQ 168(R11), R13
	VMOVDQU64 (R13)(R12*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	// (from word 22 on, matrix words come from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x8_64Xor_loop
	VZEROUPPER

mulGFNI_7x8_64Xor_end:
	RET
// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x8Xor combines 7 inputs into 8 outputs, 32 bytes per iteration,
// and XORs the result into the bytes already stored in each output.
// For input i and output j the current 32-byte block of in[i] is passed through
// VGF2P8AFFINEQB (immediate 0) with the 8-byte word matrix[i*8+j]; the seven
// results for output j are XORed together with the previous contents of out[j].
// Presumably every matrix word encodes multiplication by one GF(2^8)
// coefficient - confirm against the Go caller that builds the matrix.
//
// Only n>>5 whole blocks are processed. When that count is zero the function
// returns immediately; the n&31 trailing bytes are never touched.
//
// Register use:
//   AX       remaining 32-byte blocks
//   CX       matrix base; words 0-5 are pre-broadcast into Y0-Y5, words 6-55
//            are broadcast into Y15 inside the loop right before use
//   BX, SI, DI, R8, R9, R10, DX
//            read cursors for in[0]..in[6], already advanced by start
//   R11      base of the out slice headers (24 bytes apart)
//   R12      byte offset applied to every output, initialised to start
//   R13      scratch: data pointer of the output being loaded or stored
//   Y6-Y13   accumulators for out[0]..out[7]
//   Y14      current input block
//   Y15      matrix word, then the product computed from it
//
// NOTE(review): this file is generated by gen.go. Two dead instructions from
// the generator output were dropped here (a TESTQ repeating the ZF produced by
// SHRQ, and a second identical load of out_base into R11); the generator
// should be fixed as well or regeneration will bring them back.
TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88
	// Loading 6 of 56 tables to registers
	// Destination kept on stack
	// Full registers estimated 66 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks; SHRQ sets ZF when that is zero.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x8Xor_end

	// Pre-broadcast matrix words 0-5.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x8Xor_loop:
	// Load 8 outputs
	MOVQ (R11), R13
	VMOVDQU (R13)(R12*1), Y6
	MOVQ 24(R11), R13
	VMOVDQU (R13)(R12*1), Y7
	MOVQ 48(R11), R13
	VMOVDQU (R13)(R12*1), Y8
	MOVQ 72(R11), R13
	VMOVDQU (R13)(R12*1), Y9
	MOVQ 96(R11), R13
	VMOVDQU (R13)(R12*1), Y10
	MOVQ 120(R11), R13
	VMOVDQU (R13)(R12*1), Y11
	MOVQ 144(R11), R13
	VMOVDQU (R13)(R12*1), Y12
	MOVQ 168(R11), R13
	VMOVDQU (R13)(R12*1), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	MOVQ (R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x8Xor_loop
	VZEROUPPER

mulAvxGFNI_7x8Xor_end:
	RET
// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_7x9_64 combines 7 inputs into 9 outputs, 64 bytes per iteration,
// overwriting the outputs. For input i and output j the current 64-byte block
// of in[i] is passed through VGF2P8AFFINEQB (immediate 0) with the 8-byte word
// matrix[i*9+j]; the seven results for output j are XORed together and stored
// to out[j]. The previous output contents are never read: the products of
// input 0 are written straight into the accumulators.
// Presumably every matrix word encodes multiplication by one GF(2^8)
// coefficient - confirm against the Go caller that builds the matrix.
//
// Only n>>6 whole blocks are processed. When that count is zero the function
// returns immediately; the n&63 trailing bytes are never touched.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base; words 0-20 are pre-broadcast into Z0-Z20, words
//            21-62 are read with an embedded broadcast inside the loop
//   BX, SI, DI, R8, R9, R10, DX
//            read cursors for in[0]..in[6], already advanced by start
//   R11      base of the out slice headers (24 bytes apart)
//   R12      byte offset applied to every output, initialised to start
//   R13      scratch: data pointer of the output being stored
//   Z21-Z29  accumulators for out[0]..out[8]
//   Z30      current input block
//   Z31      per-product temporary
//
// NOTE(review): this file is generated by gen.go. Two dead instructions from
// the generator output were dropped here (a TESTQ repeating the ZF produced by
// SHRQ, and a second identical load of out_base into R11); the generator
// should be fixed as well or regeneration will bring them back.
TEXT ·mulGFNI_7x9_64(SB), $0-88
	// Loading 21 of 63 tables to registers
	// Destination kept on stack
	// Full registers estimated 74 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 64-byte blocks; SHRQ sets ZF when that is zero.
	SHRQ $0x06, AX
	JZ mulGFNI_7x9_64_end

	// Pre-broadcast matrix words 0-20.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulGFNI_7x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (results initialise the accumulators, no XOR needed)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (from word 21 on, matrix words come from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R11), R13
	VMOVDQU64 Z21, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU64 Z22, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU64 Z23, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU64 Z24, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU64 Z25, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU64 Z26, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU64 Z27, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU64 Z28, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU64 Z29, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x40, R12
	DECQ AX
	JNZ mulGFNI_7x9_64_loop
	VZEROUPPER

mulGFNI_7x9_64_end:
	RET
// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_7x9 combines 7 inputs into 9 outputs, 32 bytes per iteration,
// overwriting the outputs. For input i and output j the current 32-byte block
// of in[i] is passed through VGF2P8AFFINEQB (immediate 0) with the 8-byte word
// matrix[i*9+j]; the seven results for output j are XORed together and stored
// to out[j]. The previous output contents are never read: the products of
// input 0 are written straight into the accumulators.
// Presumably every matrix word encodes multiplication by one GF(2^8)
// coefficient - confirm against the Go caller that builds the matrix.
//
// Only n>>5 whole blocks are processed. When that count is zero the function
// returns immediately; the n&31 trailing bytes are never touched.
//
// Register use:
//   AX       remaining 32-byte blocks
//   CX       matrix base; words 0-4 are pre-broadcast into Y0-Y4, words 5-62
//            are broadcast inside the loop right before use
//   BX, SI, DI, R8, R9, R10, DX
//            read cursors for in[0]..in[6], already advanced by start
//   R11      base of the out slice headers (24 bytes apart)
//   R12      byte offset applied to every output, initialised to start
//   R13      scratch: data pointer of the output being stored
//   Y5-Y13   accumulators for out[0]..out[8]
//   Y14      current input block
//   Y15      matrix word, then the product computed from it
//
// NOTE(review): this file is generated by gen.go. Two dead instructions from
// the generator output were dropped here (a TESTQ repeating the ZF produced by
// SHRQ, and a second identical load of out_base into R11); the generator
// should be fixed as well or regeneration will bring them back.
TEXT ·mulAvxGFNI_7x9(SB), $0-88
	// Loading 5 of 63 tables to registers
	// Destination kept on stack
	// Full registers estimated 74 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = number of whole 32-byte blocks; SHRQ sets ZF when that is zero.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_7x9_end

	// Pre-broadcast matrix words 0-4.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice (headers are 24 bytes apart).
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ start+72(FP), R12

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX

mulAvxGFNI_7x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (results initialise the accumulators; words 5-8 are broadcast into the
	// accumulator register itself and transformed in place)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R11), R13
	VMOVDQU Y5, (R13)(R12*1)
	MOVQ 24(R11), R13
	VMOVDQU Y6, (R13)(R12*1)
	MOVQ 48(R11), R13
	VMOVDQU Y7, (R13)(R12*1)
	MOVQ 72(R11), R13
	VMOVDQU Y8, (R13)(R12*1)
	MOVQ 96(R11), R13
	VMOVDQU Y9, (R13)(R12*1)
	MOVQ 120(R11), R13
	VMOVDQU Y10, (R13)(R12*1)
	MOVQ 144(R11), R13
	VMOVDQU Y11, (R13)(R12*1)
	MOVQ 168(R11), R13
	VMOVDQU Y12, (R13)(R12*1)
	MOVQ 192(R11), R13
	VMOVDQU Y13, (R13)(R12*1)

	// Prepare for next loop
	ADDQ $0x20, R12
	DECQ AX
	JNZ mulAvxGFNI_7x9_loop
	VZEROUPPER

mulAvxGFNI_7x9_end:
	RET
// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x9_64Xor(SB), $0-88
// Loading 21 of 63 tables to registers
// Destination kept on stack
// Full registers estimated 74 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x9_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
MOVQ out_base+48(FP), R11
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulGFNI_7x9_64Xor_loop:
// Load 9 outputs
MOVQ (R11), R13
VMOVDQU64 (R13)(R12*1), Z21
MOVQ 24(R11), R13
VMOVDQU64 (R13)(R12*1), Z22
MOVQ 48(R11), R13
VMOVDQU64 (R13)(R12*1), Z23
MOVQ 72(R11), R13
VMOVDQU64 (R13)(R12*1), Z24
MOVQ 96(R11), R13
VMOVDQU64 (R13)(R12*1), Z25
MOVQ 120(R11), R13
VMOVDQU64 (R13)(R12*1), Z26
MOVQ 144(R11), R13
VMOVDQU64 (R13)(R12*1), Z27
MOVQ 168(R11), R13
VMOVDQU64 (R13)(R12*1), Z28
MOVQ 192(R11), R13
VMOVDQU64 (R13)(R12*1), Z29
// Load and process 64 bytes from input 0 to 9 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 9 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 9 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 9 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 9 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 9 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 9 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 9 outputs
MOVQ (R11), R13
VMOVDQU64 Z21, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU64 Z22, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU64 Z23, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU64 Z24, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU64 Z25, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU64 Z26, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU64 Z27, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU64 Z28, (R13)(R12*1)
MOVQ 192(R11), R13
VMOVDQU64 Z29, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x40, R12
DECQ AX
JNZ mulGFNI_7x9_64Xor_loop
VZEROUPPER
mulGFNI_7x9_64Xor_end:
RET
// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 7 input slices into 9 output slices, 32 bytes per iteration, and
// XORs the result into the bytes already present in the outputs:
//
//   out[o][start+k] ^= XOR over i = 0..6 of affine(matrix[i*9+o], in[i][start+k])
//
// where affine is the per-byte 8x8 bit-matrix transform performed by
// VGF2P8AFFINEQB with a zero immediate. Presumably every matrix qword encodes
// the multiplication by one GF(2^8) coefficient - confirm against the Go code
// that builds the matrix.
//
// n>>5 whole 32-byte blocks are processed; a tail of n%32 bytes is left
// untouched and nothing happens when n < 32. No bounds checks are done here.
//
// Register use:
//   AX                           remaining 32-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6]
//   R11                          base of the out slice-header array
//   R12                          byte offset applied to every output (begins at start)
//   R13                          scratch: data pointer of the output being accessed
//   Y0-Y4                        matrix[0..4], broadcast once before the loop
//   Y5-Y13                       accumulators for out[0]..out[8]
//   Y14                          current 32 input bytes
//   Y15                          scratch: broadcast matrix entry, then transformed bytes
TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88
// Loading 5 of 63 tables to registers
// Destination pointers are not kept in registers; they are re-read through
// R11 on every iteration
// Full registers estimated 74 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; bail out if there are none
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x9Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Fetch the data pointer of each input; slice headers are 24 bytes apart.
// DX is loaded last because it holds the header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = base of the out slice headers (loaded once)
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulAvxGFNI_7x9Xor_loop:
// Load 9 outputs
MOVQ (R11), R13
VMOVDQU (R13)(R12*1), Y5
MOVQ 24(R11), R13
VMOVDQU (R13)(R12*1), Y6
MOVQ 48(R11), R13
VMOVDQU (R13)(R12*1), Y7
MOVQ 72(R11), R13
VMOVDQU (R13)(R12*1), Y8
MOVQ 96(R11), R13
VMOVDQU (R13)(R12*1), Y9
MOVQ 120(R11), R13
VMOVDQU (R13)(R12*1), Y10
MOVQ 144(R11), R13
VMOVDQU (R13)(R12*1), Y11
MOVQ 168(R11), R13
VMOVDQU (R13)(R12*1), Y12
MOVQ 192(R11), R13
VMOVDQU (R13)(R12*1), Y13
// Load and process 32 bytes from input 0 to 9 outputs
// (the first five matrix entries come from the preloaded Y0-Y4)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 9 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 9 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
MOVQ (R11), R13
VMOVDQU Y5, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU Y6, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU Y7, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU Y8, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU Y9, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU Y10, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU Y11, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU Y12, (R13)(R12*1)
MOVQ 192(R11), R13
VMOVDQU Y13, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x20, R12
DECQ AX
JNZ mulAvxGFNI_7x9Xor_loop
VZEROUPPER
mulAvxGFNI_7x9Xor_end:
RET
// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 7 input slices into 10 output slices, 64 bytes per iteration,
// overwriting the outputs:
//
//   out[o][start+k] = XOR over i = 0..6 of affine(matrix[i*10+o], in[i][start+k])
//
// where affine is the per-byte 8x8 bit-matrix transform performed by
// VGF2P8AFFINEQB with a zero immediate. Presumably every matrix qword encodes
// the multiplication by one GF(2^8) coefficient - confirm against the Go code
// that builds the matrix.
//
// n>>6 whole 64-byte blocks are processed; a tail of n%64 bytes is left
// untouched and nothing happens when n < 64. No bounds checks are done here.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6]
//   R11                          base of the out slice-header array
//   R12                          byte offset applied to every output (begins at start)
//   R13                          scratch: data pointer of the output being stored
//   Z0-Z19                       matrix[0..19] (inputs 0 and 1), broadcast once
//   Z20-Z29                      accumulators for out[0]..out[9]
//   Z30                          current 64 input bytes
//   Z31                          scratch: transformed bytes
TEXT ·mulGFNI_7x10_64(SB), $0-88
// Loading 20 of 70 tables to registers
// Destination pointers are not kept in registers; they are re-read through
// R11 on every iteration
// Full registers estimated 82 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; bail out if there are none
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x10_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each input; slice headers are 24 bytes apart.
// DX is loaded last because it holds the header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = base of the out slice headers (loaded once)
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulGFNI_7x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// (results are written straight into the accumulators, which initialises them)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (from here on the matrix entry is a broadcast memory operand)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R11), R13
VMOVDQU64 Z20, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU64 Z21, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU64 Z22, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU64 Z23, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU64 Z24, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU64 Z25, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU64 Z26, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU64 Z27, (R13)(R12*1)
MOVQ 192(R11), R13
VMOVDQU64 Z28, (R13)(R12*1)
MOVQ 216(R11), R13
VMOVDQU64 Z29, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x40, R12
DECQ AX
JNZ mulGFNI_7x10_64_loop
VZEROUPPER
mulGFNI_7x10_64_end:
RET
// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 7 input slices into 10 output slices, 32 bytes per iteration,
// overwriting the outputs:
//
//   out[o][start+k] = XOR over i = 0..6 of affine(matrix[i*10+o], in[i][start+k])
//
// where affine is the per-byte 8x8 bit-matrix transform performed by
// VGF2P8AFFINEQB with a zero immediate. Presumably every matrix qword encodes
// the multiplication by one GF(2^8) coefficient - confirm against the Go code
// that builds the matrix.
//
// n>>5 whole 32-byte blocks are processed; a tail of n%32 bytes is left
// untouched and nothing happens when n < 32. No bounds checks are done here.
//
// Register use:
//   AX                           remaining 32-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6]
//   R11                          base of the out slice-header array
//   R12                          byte offset applied to every output (begins at start)
//   R13                          scratch: data pointer of the output being stored
//   Y0-Y3                        matrix[0..3], broadcast once before the loop
//   Y4-Y13                       accumulators for out[0]..out[9]
//   Y14                          current 32 input bytes
//   Y15                          scratch: broadcast matrix entry, then transformed bytes
TEXT ·mulAvxGFNI_7x10(SB), $0-88
// Loading 4 of 70 tables to registers
// Destination pointers are not kept in registers; they are re-read through
// R11 on every iteration
// Full registers estimated 82 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; bail out if there are none
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_7x10_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input; slice headers are 24 bytes apart.
// DX is loaded last because it holds the header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = base of the out slice headers (loaded once)
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulAvxGFNI_7x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// (results go straight into the accumulators; for outputs 4..9 the matrix
// entry is broadcast into the accumulator and transformed in place)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R11), R13
VMOVDQU Y4, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU Y5, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU Y6, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU Y7, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU Y8, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU Y9, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU Y10, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU Y11, (R13)(R12*1)
MOVQ 192(R11), R13
VMOVDQU Y12, (R13)(R12*1)
MOVQ 216(R11), R13
VMOVDQU Y13, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x20, R12
DECQ AX
JNZ mulAvxGFNI_7x10_loop
VZEROUPPER
mulAvxGFNI_7x10_end:
RET
// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 7 input slices into 10 output slices, 64 bytes per iteration, and
// XORs the result into the bytes already present in the outputs:
//
//   out[o][start+k] ^= XOR over i = 0..6 of affine(matrix[i*10+o], in[i][start+k])
//
// where affine is the per-byte 8x8 bit-matrix transform performed by
// VGF2P8AFFINEQB with a zero immediate. Presumably every matrix qword encodes
// the multiplication by one GF(2^8) coefficient - confirm against the Go code
// that builds the matrix.
//
// n>>6 whole 64-byte blocks are processed; a tail of n%64 bytes is left
// untouched and nothing happens when n < 64. No bounds checks are done here.
//
// Register use:
//   AX                           remaining 64-byte blocks
//   CX                           matrix base
//   BX, SI, DI, R8, R9, R10, DX  read cursors for in[0]..in[6]
//   R11                          base of the out slice-header array
//   R12                          byte offset applied to every output (begins at start)
//   R13                          scratch: data pointer of the output being accessed
//   Z0-Z19                       matrix[0..19] (inputs 0 and 1), broadcast once
//   Z20-Z29                      accumulators for out[0]..out[9]
//   Z30                          current 64 input bytes
//   Z31                          scratch: transformed bytes
TEXT ·mulGFNI_7x10_64Xor(SB), $0-88
// Loading 20 of 70 tables to registers
// Destination pointers are not kept in registers; they are re-read through
// R11 on every iteration
// Full registers estimated 82 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; bail out if there are none
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_7x10_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each input; slice headers are 24 bytes apart.
// DX is loaded last because it holds the header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
// R11 = base of the out slice headers (loaded once)
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulGFNI_7x10_64Xor_loop:
// Load 10 outputs
MOVQ (R11), R13
VMOVDQU64 (R13)(R12*1), Z20
MOVQ 24(R11), R13
VMOVDQU64 (R13)(R12*1), Z21
MOVQ 48(R11), R13
VMOVDQU64 (R13)(R12*1), Z22
MOVQ 72(R11), R13
VMOVDQU64 (R13)(R12*1), Z23
MOVQ 96(R11), R13
VMOVDQU64 (R13)(R12*1), Z24
MOVQ 120(R11), R13
VMOVDQU64 (R13)(R12*1), Z25
MOVQ 144(R11), R13
VMOVDQU64 (R13)(R12*1), Z26
MOVQ 168(R11), R13
VMOVDQU64 (R13)(R12*1), Z27
MOVQ 192(R11), R13
VMOVDQU64 (R13)(R12*1), Z28
MOVQ 216(R11), R13
VMOVDQU64 (R13)(R12*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (from here on the matrix entry is a broadcast memory operand)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R11), R13
VMOVDQU64 Z20, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU64 Z21, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU64 Z22, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU64 Z23, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU64 Z24, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU64 Z25, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU64 Z26, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU64 Z27, (R13)(R12*1)
MOVQ 192(R11), R13
VMOVDQU64 Z28, (R13)(R12*1)
MOVQ 216(R11), R13
VMOVDQU64 Z29, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x40, R12
DECQ AX
JNZ mulGFNI_7x10_64Xor_loop
VZEROUPPER
mulGFNI_7x10_64Xor_end:
RET
// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulates 7 inputs into 10 outputs, 32 bytes per loop iteration:
//   out[j][start+k] ^= XOR over i=0..6 of affine(matrix[i*10+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>5 whole 32-byte blocks are processed; the trailing n&31 bytes are not touched.
//
// Register use:
//   AX       remaining 32-byte blocks
//   CX       base of matrix (8 bytes per entry)
//   Y0-Y3    matrix[0..3] broadcast to all lanes; every other entry is broadcast on demand
//   BX, SI, DI, R8, R9, R10, DX   read cursors for in[0]..in[6]
//   R11      base of the out slice headers (24 bytes per header)
//   R12      byte offset into every output, starts at start
//   R13      scratch: data pointer of the output currently loaded/stored
//   Y4-Y13   accumulators for out[0]..out[9]
//   Y14      current input block, Y15 scratch
TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88
// Loading 4 of 70 tables to registers
// Destination kept on stack
// Full registers estimated 82 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x05, AX
JZ mulAvxGFNI_7x10Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input slice; DX is reused for the last one.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), DX
MOVQ out_base+48(FP), R11
MOVQ start+72(FP), R12
// Add start offset to input
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, DX
mulAvxGFNI_7x10Xor_loop:
// Load 10 outputs (output pointers are re-read from the slice headers each iteration)
MOVQ (R11), R13
VMOVDQU (R13)(R12*1), Y4
MOVQ 24(R11), R13
VMOVDQU (R13)(R12*1), Y5
MOVQ 48(R11), R13
VMOVDQU (R13)(R12*1), Y6
MOVQ 72(R11), R13
VMOVDQU (R13)(R12*1), Y7
MOVQ 96(R11), R13
VMOVDQU (R13)(R12*1), Y8
MOVQ 120(R11), R13
VMOVDQU (R13)(R12*1), Y9
MOVQ 144(R11), R13
VMOVDQU (R13)(R12*1), Y10
MOVQ 168(R11), R13
VMOVDQU (R13)(R12*1), Y11
MOVQ 192(R11), R13
VMOVDQU (R13)(R12*1), Y12
MOVQ 216(R11), R13
VMOVDQU (R13)(R12*1), Y13
// Load and process 32 bytes from input 0 to 10 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y4, Y15, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 32(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R11), R13
VMOVDQU Y4, (R13)(R12*1)
MOVQ 24(R11), R13
VMOVDQU Y5, (R13)(R12*1)
MOVQ 48(R11), R13
VMOVDQU Y6, (R13)(R12*1)
MOVQ 72(R11), R13
VMOVDQU Y7, (R13)(R12*1)
MOVQ 96(R11), R13
VMOVDQU Y8, (R13)(R12*1)
MOVQ 120(R11), R13
VMOVDQU Y9, (R13)(R12*1)
MOVQ 144(R11), R13
VMOVDQU Y10, (R13)(R12*1)
MOVQ 168(R11), R13
VMOVDQU Y11, (R13)(R12*1)
MOVQ 192(R11), R13
VMOVDQU Y12, (R13)(R12*1)
MOVQ 216(R11), R13
VMOVDQU Y13, (R13)(R12*1)
// Prepare for next loop
ADDQ $0x20, R12
DECQ AX
JNZ mulAvxGFNI_7x10Xor_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulAvxGFNI_7x10Xor_end:
RET
// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 1 output from 8 inputs, 64 bytes per loop iteration, overwriting the output:
//   out[0][start+k] = XOR over i=0..7 of affine(matrix[i], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>6 whole 64-byte blocks are processed; the trailing n&63 bytes are not touched.
//
// Register use:
//   AX      remaining 64-byte blocks
//   Z0-Z7   matrix[0..7], each 8-byte entry broadcast to all lanes
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0]..in[7]
//   R11     write cursor for out[0]
//   Z8      accumulator, Z9 current input block / scratch
TEXT ·mulGFNI_8x1_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 11 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x06, AX
JZ mulGFNI_8x1_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
// The matrix now lives in registers, so CX is reused for the input slice headers
// (24 bytes each) and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), CX
MOVQ out_base+48(FP), R11
MOVQ (R11), R11
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R11
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, CX
mulGFNI_8x1_64_loop:
// Load and process 64 bytes from input 0 to 1 outputs
// (the first product initialises the accumulator directly)
VMOVDQU64 (DX), Z9
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z9, Z8
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z9
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z9
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z9
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z9
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (R9), Z9
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 6 to 1 outputs
VMOVDQU64 (R10), Z9
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 7 to 1 outputs
VMOVDQU64 (CX), Z9
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
VXORPD Z8, Z9, Z8
// Store 1 outputs
VMOVDQU64 Z8, (R11)
ADDQ $0x40, R11
// Prepare for next loop
DECQ AX
JNZ mulGFNI_8x1_64_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulGFNI_8x1_64_end:
RET
// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 1 output from 8 inputs, 32 bytes per loop iteration, overwriting the output:
//   out[0][start+k] = XOR over i=0..7 of affine(matrix[i], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>5 whole 32-byte blocks are processed; the trailing n&31 bytes are not touched.
//
// Register use:
//   AX      remaining 32-byte blocks
//   Y0-Y7   matrix[0..7], each 8-byte entry broadcast to all lanes
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0]..in[7]
//   R11     write cursor for out[0]
//   Y8      accumulator, Y9 current input block / scratch
TEXT ·mulAvxGFNI_8x1(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 11 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x1_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// The matrix now lives in registers, so CX is reused for the input slice headers
// (24 bytes each) and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), CX
MOVQ out_base+48(FP), R11
MOVQ (R11), R11
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R11
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, CX
mulAvxGFNI_8x1_loop:
// Load and process 32 bytes from input 0 to 1 outputs
// (the first product initialises the accumulator directly)
VMOVDQU (DX), Y9
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y9, Y8
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y9
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y9
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y9
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y9
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (R9), Y9
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 6 to 1 outputs
VMOVDQU (R10), Y9
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 7 to 1 outputs
VMOVDQU (CX), Y9
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
VXORPD Y8, Y9, Y8
// Store 1 outputs
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_8x1_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulAvxGFNI_8x1_end:
RET
// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulates 8 inputs into 1 output, 64 bytes per loop iteration:
//   out[0][start+k] ^= XOR over i=0..7 of affine(matrix[i], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>6 whole 64-byte blocks are processed; the trailing n&63 bytes are not touched.
//
// Register use:
//   AX      remaining 64-byte blocks
//   Z0-Z7   matrix[0..7], each 8-byte entry broadcast to all lanes
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0]..in[7]
//   R11     read/write cursor for out[0]
//   Z8      accumulator (seeded with the existing output), Z9 input block / scratch
TEXT ·mulGFNI_8x1_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 11 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x06, AX
JZ mulGFNI_8x1_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
// The matrix now lives in registers, so CX is reused for the input slice headers
// (24 bytes each) and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), CX
MOVQ out_base+48(FP), R11
MOVQ (R11), R11
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R11
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, CX
mulGFNI_8x1_64Xor_loop:
// Load 1 outputs
VMOVDQU64 (R11), Z8
// Load and process 64 bytes from input 0 to 1 outputs
VMOVDQU64 (DX), Z9
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z9
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z9
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z9
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z9
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (R9), Z9
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z5, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 6 to 1 outputs
VMOVDQU64 (R10), Z9
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z6, Z9, Z9
VXORPD Z8, Z9, Z8
// Load and process 64 bytes from input 7 to 1 outputs
VMOVDQU64 (CX), Z9
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z7, Z9, Z9
VXORPD Z8, Z9, Z8
// Store 1 outputs
VMOVDQU64 Z8, (R11)
ADDQ $0x40, R11
// Prepare for next loop
DECQ AX
JNZ mulGFNI_8x1_64Xor_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulGFNI_8x1_64Xor_end:
RET
// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulates 8 inputs into 1 output, 32 bytes per loop iteration:
//   out[0][start+k] ^= XOR over i=0..7 of affine(matrix[i], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>5 whole 32-byte blocks are processed; the trailing n&31 bytes are not touched.
//
// Register use:
//   AX      remaining 32-byte blocks
//   Y0-Y7   matrix[0..7], each 8-byte entry broadcast to all lanes
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0]..in[7]
//   R11     read/write cursor for out[0]
//   Y8      accumulator (seeded with the existing output), Y9 input block / scratch
TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 11 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x1Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// The matrix now lives in registers, so CX is reused for the input slice headers
// (24 bytes each) and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), CX
MOVQ out_base+48(FP), R11
MOVQ (R11), R11
MOVQ start+72(FP), R12
// Add start offset to output
ADDQ R12, R11
// Add start offset to input
ADDQ R12, DX
ADDQ R12, BX
ADDQ R12, SI
ADDQ R12, DI
ADDQ R12, R8
ADDQ R12, R9
ADDQ R12, R10
ADDQ R12, CX
mulAvxGFNI_8x1Xor_loop:
// Load 1 outputs
VMOVDQU (R11), Y8
// Load and process 32 bytes from input 0 to 1 outputs
VMOVDQU (DX), Y9
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y9
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y9
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y9
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y9
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (R9), Y9
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y5, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 6 to 1 outputs
VMOVDQU (R10), Y9
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y6, Y9, Y9
VXORPD Y8, Y9, Y8
// Load and process 32 bytes from input 7 to 1 outputs
VMOVDQU (CX), Y9
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y7, Y9, Y9
VXORPD Y8, Y9, Y8
// Store 1 outputs
VMOVDQU Y8, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_8x1Xor_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulAvxGFNI_8x1Xor_end:
RET
// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 2 outputs from 8 inputs, 64 bytes per loop iteration, overwriting the outputs:
//   out[j][start+k] = XOR over i=0..7 of affine(matrix[i*2+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>6 whole 64-byte blocks are processed; the trailing n&63 bytes are not touched.
//
// Register use:
//   AX        remaining 64-byte blocks
//   Z0-Z15    matrix[0..15], each 8-byte entry broadcast to all lanes
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0]..in[7]
//   R12, R11  write cursors for out[0], out[1]
//   Z16, Z17  accumulators for out[0], out[1]
//   Z18       current input block, Z19 scratch
TEXT ·mulGFNI_8x2_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x06, AX
JZ mulGFNI_8x2_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
// The matrix now lives in registers, so CX is reused for the input slice headers
// (24 bytes each) and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), CX
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R11
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R12
ADDQ R13, R11
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, CX
mulGFNI_8x2_64_loop:
// Load and process 64 bytes from input 0 to 2 outputs
// (the first products initialise the accumulators directly)
VMOVDQU64 (DX), Z18
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z18, Z16
VGF2P8AFFINEQB $0x00, Z1, Z18, Z17
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z18
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (SI), Z18
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 3 to 2 outputs
VMOVDQU64 (DI), Z18
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 4 to 2 outputs
VMOVDQU64 (R8), Z18
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 5 to 2 outputs
VMOVDQU64 (R9), Z18
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 6 to 2 outputs
VMOVDQU64 (R10), Z18
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 7 to 2 outputs
VMOVDQU64 (CX), Z18
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
VXORPD Z17, Z19, Z17
// Store 2 outputs
VMOVDQU64 Z16, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z17, (R11)
ADDQ $0x40, R11
// Prepare for next loop
DECQ AX
JNZ mulGFNI_8x2_64_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulGFNI_8x2_64_end:
RET
// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 2 outputs from 8 inputs, 32 bytes per loop iteration, overwriting the outputs:
//   out[j][start+k] = XOR over i=0..7 of affine(matrix[i*2+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>5 whole 32-byte blocks are processed; the trailing n&31 bytes are not touched.
//
// Register use:
//   AX        remaining 32-byte blocks
//   CX        base of matrix; entries 12..15 are broadcast from memory inside the loop
//   Y0-Y11    matrix[0..11], each 8-byte entry broadcast to all lanes
//   BX, SI, DI, R8, R9, R10, R11, DX   read cursors for in[0]..in[7]
//   R13, R12  write cursors for out[0], out[1]
//   Y12, Y13  accumulators for out[0], out[1]
//   Y14       current input block, Y15 scratch
TEXT ·mulAvxGFNI_8x2(SB), $0-88
// Loading 12 of 16 tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x2_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
VBROADCASTSD 88(CX), Y11
// Fetch the data pointer of each input slice (headers are 24 bytes apart);
// DX is reused for the last one.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ (R12), R13
MOVQ 24(R12), R12
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R13
ADDQ R14, R12
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, DX
mulAvxGFNI_8x2_loop:
// Load and process 32 bytes from input 0 to 2 outputs
// (the first products initialise the accumulators directly)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 2 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 2 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 2 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 2 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 2 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 2 outputs
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (R12)
ADDQ $0x20, R12
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_8x2_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulAvxGFNI_8x2_end:
RET
// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulates 8 inputs into 2 outputs, 64 bytes per loop iteration:
//   out[j][start+k] ^= XOR over i=0..7 of affine(matrix[i*2+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>6 whole 64-byte blocks are processed; the trailing n&63 bytes are not touched.
//
// Register use:
//   AX        remaining 64-byte blocks
//   Z0-Z15    matrix[0..15], each 8-byte entry broadcast to all lanes
//   DX, BX, SI, DI, R8, R9, R10, CX   read cursors for in[0]..in[7]
//   R12, R11  read/write cursors for out[0], out[1]
//   Z16, Z17  accumulators for out[0], out[1] (seeded with the existing outputs)
//   Z18       current input block, Z19 scratch
TEXT ·mulGFNI_8x2_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x06, AX
JZ mulGFNI_8x2_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
// The matrix now lives in registers, so CX is reused for the input slice headers
// (24 bytes each) and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), CX
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R11
MOVQ start+72(FP), R13
// Add start offset to output
ADDQ R13, R12
ADDQ R13, R11
// Add start offset to input
ADDQ R13, DX
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, CX
mulGFNI_8x2_64Xor_loop:
// Load 2 outputs
VMOVDQU64 (R12), Z16
VMOVDQU64 (R11), Z17
// Load and process 64 bytes from input 0 to 2 outputs
VMOVDQU64 (DX), Z18
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z18
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (SI), Z18
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 3 to 2 outputs
VMOVDQU64 (DI), Z18
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 4 to 2 outputs
VMOVDQU64 (R8), Z18
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 5 to 2 outputs
VMOVDQU64 (R9), Z18
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 6 to 2 outputs
VMOVDQU64 (R10), Z18
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
VXORPD Z17, Z19, Z17
// Load and process 64 bytes from input 7 to 2 outputs
VMOVDQU64 (CX), Z18
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
VXORPD Z16, Z19, Z16
VGF2P8AFFINEQB $0x00, Z15, Z18, Z19
VXORPD Z17, Z19, Z17
// Store 2 outputs
VMOVDQU64 Z16, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z17, (R11)
ADDQ $0x40, R11
// Prepare for next loop
DECQ AX
JNZ mulGFNI_8x2_64Xor_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulGFNI_8x2_64Xor_end:
RET
// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulates 8 inputs into 2 outputs, 32 bytes per loop iteration:
//   out[j][start+k] ^= XOR over i=0..7 of affine(matrix[i*2+j], in[i][start+k])
// where affine is VGF2P8AFFINEQB with a zero constant byte.
// Only n>>5 whole 32-byte blocks are processed; the trailing n&31 bytes are not touched.
//
// Register use:
//   AX        remaining 32-byte blocks
//   CX        base of matrix; entries 12..15 are broadcast from memory inside the loop
//   Y0-Y11    matrix[0..11], each 8-byte entry broadcast to all lanes
//   BX, SI, DI, R8, R9, R10, R11, DX   read cursors for in[0]..in[7]
//   R13, R12  read/write cursors for out[0], out[1]
//   Y12, Y13  accumulators for out[0], out[1] (seeded with the existing outputs)
//   Y14       current input block, Y15 scratch
TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88
// Loading 12 of 16 tables to registers
// Destination kept in GP registers
// Full registers estimated 20 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from the result, so no separate TESTQ is needed.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x2Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
VBROADCASTSD 88(CX), Y11
// Fetch the data pointer of each input slice (headers are 24 bytes apart);
// DX is reused for the last one.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ (R12), R13
MOVQ 24(R12), R12
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R13
ADDQ R14, R12
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, DX
mulAvxGFNI_8x2Xor_loop:
// Load 2 outputs
VMOVDQU (R13), Y12
VMOVDQU (R12), Y13
// Load and process 32 bytes from input 0 to 2 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 2 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 2 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 2 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 2 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 2 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 2 outputs
VMOVDQU Y12, (R13)
ADDQ $0x20, R13
VMOVDQU Y13, (R12)
ADDQ $0x20, R12
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_8x2Xor_loop
VZEROUPPER
// The early exit lands here without VZEROUPPER: no vector register has been written yet.
mulAvxGFNI_8x2Xor_end:
RET
// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 3 outputs from 8 inputs, 64 bytes per loop iteration.
// For every output o (0..2) the block at out[o][start:] is OVERWRITTEN with
// the XOR over i = 0..7 of VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]).
// Only n>>6 whole 64-byte blocks are handled; the low 6 bits of n are
// ignored and the function returns immediately when n>>6 is zero.
//
// Register roles:
//   AX                               remaining 64-byte blocks
//   Z0-Z23                           matrix[0..23], each qword broadcast to the full vector
//   DX, BX, SI, DI, R8, R9, R10, CX  read cursors for in[0..7]
//   R12, R13, R11                    write cursors for out[0..2]
//   Z24-Z26                          accumulators for out[0..2]
//   Z27 / Z28                        current input block / scratch product
TEXT ·mulGFNI_8x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_8x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// CX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	// Fetch the data pointer of each output slice; R11 is overwritten last.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14
	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11
	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, CX

mulGFNI_8x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU64 (DX), Z27
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z27, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z27, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z27, Z26
	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z27
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z27
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z27
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z27
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z27
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z27
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (CX), Z27
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Store 3 outputs
	VMOVDQU64 Z24, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z25, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z26, (R11)
	ADDQ $0x40, R11
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_8x3_64_loop
	VZEROUPPER

mulGFNI_8x3_64_end:
	RET
// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 3 outputs from 8 inputs, 32 bytes per loop iteration.
// For every output o (0..2) the block at out[o][start:] is OVERWRITTEN with
// the XOR over i = 0..7 of VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]).
// Only n>>5 whole 32-byte blocks are handled; the low 5 bits of n are
// ignored and the function returns immediately when n>>5 is zero.
//
// Register roles:
//   AX                                remaining 32-byte blocks
//   CX                                matrix base (kept live for in-loop table loads)
//   Y0-Y10                            matrix[0..10], each qword broadcast to the full vector
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R13, R14, R12                     write cursors for out[0..2]
//   Y11-Y13                           accumulators for out[0..2]
//   Y14 / Y15                         current input block / scratch (table, then product)
TEXT ·mulAvxGFNI_8x3(SB), $0-88
	// Loading 11 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_8x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R12
	MOVQ start+72(FP), R15
	// Add start offset to output
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R12
	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, DX

mulAvxGFNI_8x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	// From matrix[11] onwards the tables are not resident: each one is
	// broadcast from memory into Y15 right before it is used.
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 3 outputs
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_8x3_loop
	VZEROUPPER

mulAvxGFNI_8x3_end:
	RET
// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: 3 outputs from 8 inputs, 64 bytes per loop iteration.
// For every output o (0..2) the existing block at out[o][start:] is read and
// XORed with VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]) for i = 0..7, then
// written back in place.
// Only n>>6 whole 64-byte blocks are handled; the low 6 bits of n are
// ignored and the function returns immediately when n>>6 is zero.
//
// Register roles:
//   AX                               remaining 64-byte blocks
//   Z0-Z23                           matrix[0..23], each qword broadcast to the full vector
//   DX, BX, SI, DI, R8, R9, R10, CX  read cursors for in[0..7]
//   R12, R13, R11                    read/write cursors for out[0..2]
//   Z24-Z26                          accumulators for out[0..2]
//   Z27 / Z28                        current input block / scratch product
TEXT ·mulGFNI_8x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_8x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// CX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), CX
	// Fetch the data pointer of each output slice; R11 is overwritten last.
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14
	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11
	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, CX

mulGFNI_8x3_64Xor_loop:
	// Load 3 outputs (the accumulators start from the current output contents)
	VMOVDQU64 (R12), Z24
	VMOVDQU64 (R13), Z25
	VMOVDQU64 (R11), Z26
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z27
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z27
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z27
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z27
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z10, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z27
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z27
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z27
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (CX), Z27
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z21, Z27, Z28
	VXORPD Z24, Z28, Z24
	VGF2P8AFFINEQB $0x00, Z22, Z27, Z28
	VXORPD Z25, Z28, Z25
	VGF2P8AFFINEQB $0x00, Z23, Z27, Z28
	VXORPD Z26, Z28, Z26
	// Store 3 outputs
	VMOVDQU64 Z24, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z25, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z26, (R11)
	ADDQ $0x40, R11
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_8x3_64Xor_loop
	VZEROUPPER

mulGFNI_8x3_64Xor_end:
	RET
// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: 3 outputs from 8 inputs, 32 bytes per loop iteration.
// For every output o (0..2) the existing block at out[o][start:] is read and
// XORed with VGF2P8AFFINEQB(in[i] block, matrix[i*3+o]) for i = 0..7, then
// written back in place.
// Only n>>5 whole 32-byte blocks are handled; the low 5 bits of n are
// ignored and the function returns immediately when n>>5 is zero.
//
// Register roles:
//   AX                                remaining 32-byte blocks
//   CX                                matrix base (kept live for in-loop table loads)
//   Y0-Y10                            matrix[0..10], each qword broadcast to the full vector
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R13, R14, R12                     read/write cursors for out[0..2]
//   Y11-Y13                           accumulators for out[0..2]
//   Y14 / Y15                         current input block / scratch (table, then product)
TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88
	// Loading 11 of 24 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 29 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_8x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R12
	MOVQ start+72(FP), R15
	// Add start offset to output
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R12
	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, DX

mulAvxGFNI_8x3Xor_loop:
	// Load 3 outputs (the accumulators start from the current output contents)
	VMOVDQU (R13), Y11
	VMOVDQU (R14), Y12
	VMOVDQU (R12), Y13
	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	// From matrix[11] onwards the tables are not resident: each one is
	// broadcast from memory into Y15 right before it is used.
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 3 outputs
	VMOVDQU Y11, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_8x3Xor_loop
	VZEROUPPER

mulAvxGFNI_8x3Xor_end:
	RET
// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Computes 4 outputs from 8 inputs, 64 bytes per loop iteration.
// For every output o (0..3) the block at out[o][start:] is OVERWRITTEN with
// the XOR over i = 0..7 of VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]).
// Only n>>6 whole 64-byte blocks are handled; the low 6 bits of n are
// ignored and the function returns immediately when n>>6 is zero.
//
// Register roles:
//   AX                                remaining 64-byte blocks
//   CX                                matrix base (kept live for in-loop .BCST operands)
//   Z0-Z25                            matrix[0..25], each qword broadcast to the full vector
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R13, R14, R15, R12                write cursors for out[0..3]
//   BP                                start offset (setup only)
//   Z26-Z29                           accumulators for out[0..3]
//   Z30 / Z31                         current input block / scratch product
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the assembler saves/restores it - confirm
// against the Go assembler documentation before changing the frame size.
TEXT ·mulGFNI_8x4_64(SB), $8-88
	// Loading 26 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_8x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, DX

mulGFNI_8x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 4 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	// matrix[26..31] are not resident in registers: they are taken straight
	// from memory as embedded-broadcast (.BCST) operands.
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 4 outputs
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R12)
	ADDQ $0x40, R12
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_8x4_64_loop
	VZEROUPPER

mulGFNI_8x4_64_end:
	RET
// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Computes 4 outputs from 8 inputs, 32 bytes per loop iteration.
// For every output o (0..3) the block at out[o][start:] is OVERWRITTEN with
// the XOR over i = 0..7 of VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]).
// Only n>>5 whole 32-byte blocks are handled; the low 5 bits of n are
// ignored and the function returns immediately when n>>5 is zero.
//
// Register roles:
//   AX                                remaining 32-byte blocks
//   CX                                matrix base (kept live for in-loop table loads)
//   Y0-Y9                             matrix[0..9], each qword broadcast to the full vector
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R13, R14, R15, R12                write cursors for out[0..3]
//   BP                                start offset (setup only)
//   Y10-Y13                           accumulators for out[0..3]
//   Y14 / Y15                         current input block / scratch (table, then product)
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the assembler saves/restores it - confirm
// against the Go assembler documentation before changing the frame size.
TEXT ·mulAvxGFNI_8x4(SB), $8-88
	// Loading 10 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_8x4_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, DX

mulAvxGFNI_8x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	// From matrix[10] onwards the tables are not resident: each one is
	// broadcast from memory into Y15 right before it is used.
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 4 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_8x4_loop
	VZEROUPPER

mulAvxGFNI_8x4_end:
	RET
// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: 4 outputs from 8 inputs, 64 bytes per loop iteration.
// For every output o (0..3) the existing block at out[o][start:] is read and
// XORed with VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]) for i = 0..7, then
// written back in place.
// Only n>>6 whole 64-byte blocks are handled; the low 6 bits of n are
// ignored and the function returns immediately when n>>6 is zero.
//
// Register roles:
//   AX                                remaining 64-byte blocks
//   CX                                matrix base (kept live for in-loop .BCST operands)
//   Z0-Z25                            matrix[0..25], each qword broadcast to the full vector
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R13, R14, R15, R12                read/write cursors for out[0..3]
//   BP                                start offset (setup only)
//   Z26-Z29                           accumulators for out[0..3]
//   Z30 / Z31                         current input block / scratch product
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the assembler saves/restores it - confirm
// against the Go assembler documentation before changing the frame size.
TEXT ·mulGFNI_8x4_64Xor(SB), $8-88
	// Loading 26 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_8x4_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, DX

mulGFNI_8x4_64Xor_loop:
	// Load 4 outputs (the accumulators start from the current output contents)
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R12), Z29
	// Load and process 64 bytes from input 0 to 4 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 4 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	// matrix[26..31] are not resident in registers: they are taken straight
	// from memory as embedded-broadcast (.BCST) operands.
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 4 outputs
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R12)
	ADDQ $0x40, R12
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_8x4_64Xor_loop
	VZEROUPPER

mulGFNI_8x4_64Xor_end:
	RET
// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: 4 outputs from 8 inputs, 32 bytes per loop iteration.
// For every output o (0..3) the existing block at out[o][start:] is read and
// XORed with VGF2P8AFFINEQB(in[i] block, matrix[i*4+o]) for i = 0..7, then
// written back in place.
// Only n>>5 whole 32-byte blocks are handled; the low 5 bits of n are
// ignored and the function returns immediately when n>>5 is zero.
//
// Register roles:
//   AX                                remaining 32-byte blocks
//   CX                                matrix base (kept live for in-loop table loads)
//   Y0-Y9                             matrix[0..9], each qword broadcast to the full vector
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R13, R14, R15, R12                read/write cursors for out[0..3]
//   BP                                start offset (setup only)
//   Y10-Y13                           accumulators for out[0..3]
//   Y14 / Y15                         current input block / scratch (table, then product)
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is there so the assembler saves/restores it - confirm
// against the Go assembler documentation before changing the frame size.
TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88
	// Loading 10 of 32 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ sets ZF from its result, so the branch needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_8x4Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is loaded last because it also holds the header array base.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	// Fetch the data pointer of each output slice; R12 is overwritten last.
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R15
	MOVQ 72(R12), R12
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R12
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, DX

mulAvxGFNI_8x4Xor_loop:
	// Load 4 outputs (the accumulators start from the current output contents)
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R12), Y13
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	// From matrix[10] onwards the tables are not resident: each one is
	// broadcast from memory into Y15 right before it is used.
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 4 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 4 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R12)
	ADDQ $0x20, R12
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_8x4Xor_loop
	VZEROUPPER

mulAvxGFNI_8x4Xor_end:
	RET
// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x5_64 combines 8 input slices into 5 output slices, 64 bytes per
// loop iteration. For each 64-byte block, output o is overwritten with the XOR
// over inputs i = 0..7 of VGF2P8AFFINEQB(in[i], matrix[i*5+o]) with a zero
// affine constant. Processing begins at byte offset start of every slice and
// covers n/64 whole blocks; the function returns without touching memory when
// n/64 is zero, and the trailing n%64 bytes are never processed.
//
// Register roles:
//   CX                               matrix base
//   Z0-Z24                           matrix[0..24], broadcast to every 64-bit lane
//   DX, BX, SI, DI, R8, R9, R10, AX  read cursors for in[0..7]
//   R12, R13, R14, R15, R11          write cursors for out[0..4]
//   BP                               remaining block count
//   Z25-Z29                          accumulators for out[0..4]
//   Z30, Z31                         current input block, scratch product
//
// NOTE(review): BP is used as a general purpose register here; the 8-byte
// frame is presumably what makes the assembler preserve it - confirm before
// changing the frame size.
TEXT ·mulGFNI_8x5_64(SB), $8-88
// Loading 25 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
SHRQ $0x06, AX
JZ mulGFNI_8x5_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// AX is overwritten last because it holds the header array base until then.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), AX
// Fetch the data pointer of each output slice; R11 is overwritten last.
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R13
MOVQ 48(R11), R14
MOVQ 72(R11), R15
MOVQ 96(R11), R11
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R11
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_8x5_64_loop:
// Load and process 64 bytes from input 0 to 5 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// (matrix[25..39] are not cached in registers; they are read from memory
// with an embedded broadcast)
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 5 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 5 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R11)
ADDQ $0x40, R11
// Prepare for next loop
DECQ BP
JNZ mulGFNI_8x5_64_loop
VZEROUPPER
mulGFNI_8x5_64_end:
RET
// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x5 combines 8 input slices into 5 output slices, 32 bytes per
// loop iteration. For each 32-byte block, output o is overwritten with the XOR
// over inputs i = 0..7 of VGF2P8AFFINEQB(in[i], matrix[i*5+o]) with a zero
// affine constant. Processing begins at byte offset start of every slice and
// covers n/32 whole blocks; the function returns without touching memory when
// n/32 is zero, and the trailing n%32 bytes are never processed.
//
// Register roles:
//   CX                               matrix base
//   Y0-Y8                            matrix[0..8], broadcast to every 64-bit lane
//   DX, BX, SI, DI, R8, R9, R10, AX  read cursors for in[0..7]
//   R12, R13, R14, R15, R11          write cursors for out[0..4]
//   BP                               remaining block count
//   Y9-Y13                           accumulators for out[0..4]
//   Y14                              current input block
//   Y15                              scratch: uncached matrix entry, then product
//
// NOTE(review): BP is used as a general purpose register here; the 8-byte
// frame is presumably what makes the assembler preserve it - confirm before
// changing the frame size.
TEXT ·mulAvxGFNI_8x5(SB), $8-88
// Loading 9 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x5_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// AX is overwritten last because it holds the header array base until then.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), AX
// Fetch the data pointer of each output slice; R11 is overwritten last.
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R13
MOVQ 48(R11), R14
MOVQ 72(R11), R15
MOVQ 96(R11), R11
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R11
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_8x5_loop:
// Load and process 32 bytes from input 0 to 5 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
// Load and process 32 bytes from input 1 to 5 outputs
// (from matrix[9] onwards the entries are not cached; each one is broadcast
// into Y15 right before it is used)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 5 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 5 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_8x5_loop
VZEROUPPER
mulAvxGFNI_8x5_end:
RET
// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x5_64Xor combines 8 input slices into 5 output slices, 64 bytes per
// loop iteration, accumulating into the existing output contents. For each
// 64-byte block, output o becomes its previous value XORed with the XOR over
// inputs i = 0..7 of VGF2P8AFFINEQB(in[i], matrix[i*5+o]) with a zero affine
// constant. Processing begins at byte offset start of every slice and covers
// n/64 whole blocks; the function returns without touching memory when n/64 is
// zero, and the trailing n%64 bytes are never processed.
//
// Register roles:
//   CX                               matrix base
//   Z0-Z24                           matrix[0..24], broadcast to every 64-bit lane
//   DX, BX, SI, DI, R8, R9, R10, AX  read cursors for in[0..7]
//   R12, R13, R14, R15, R11          read/write cursors for out[0..4]
//   BP                               remaining block count
//   Z25-Z29                          accumulators for out[0..4]
//   Z30, Z31                         current input block, scratch product
//
// NOTE(review): BP is used as a general purpose register here; the 8-byte
// frame is presumably what makes the assembler preserve it - confirm before
// changing the frame size.
TEXT ·mulGFNI_8x5_64Xor(SB), $8-88
// Loading 25 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
SHRQ $0x06, AX
JZ mulGFNI_8x5_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// AX is overwritten last because it holds the header array base until then.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), AX
// Fetch the data pointer of each output slice; R11 is overwritten last.
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R13
MOVQ 48(R11), R14
MOVQ 72(R11), R15
MOVQ 96(R11), R11
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R11
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_8x5_64Xor_loop:
// Load 5 outputs
// (the current output contents seed the accumulators)
VMOVDQU64 (R12), Z25
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (R11), Z29
// Load and process 64 bytes from input 0 to 5 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// (matrix[25..39] are not cached in registers; they are read from memory
// with an embedded broadcast)
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 5 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 5 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
VMOVDQU64 Z25, (R12)
ADDQ $0x40, R12
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R11)
ADDQ $0x40, R11
// Prepare for next loop
DECQ BP
JNZ mulGFNI_8x5_64Xor_loop
VZEROUPPER
mulGFNI_8x5_64Xor_end:
RET
// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x5Xor combines 8 input slices into 5 output slices, 32 bytes per
// loop iteration, accumulating into the existing output contents. For each
// 32-byte block, output o becomes its previous value XORed with the XOR over
// inputs i = 0..7 of VGF2P8AFFINEQB(in[i], matrix[i*5+o]) with a zero affine
// constant. Processing begins at byte offset start of every slice and covers
// n/32 whole blocks; the function returns without touching memory when n/32 is
// zero, and the trailing n%32 bytes are never processed.
//
// Register roles:
//   CX                               matrix base
//   Y0-Y8                            matrix[0..8], broadcast to every 64-bit lane
//   DX, BX, SI, DI, R8, R9, R10, AX  read cursors for in[0..7]
//   R12, R13, R14, R15, R11          read/write cursors for out[0..4]
//   BP                               remaining block count
//   Y9-Y13                           accumulators for out[0..4]
//   Y14                              current input block
//   Y15                              scratch: uncached matrix entry, then product
//
// NOTE(review): BP is used as a general purpose register here; the 8-byte
// frame is presumably what makes the assembler preserve it - confirm before
// changing the frame size.
TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88
// Loading 9 of 40 tables to registers
// Destination kept in GP registers
// Full registers estimated 47 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x5Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// AX is overwritten last because it holds the header array base until then.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), AX
// Fetch the data pointer of each output slice; R11 is overwritten last.
MOVQ out_base+48(FP), R11
MOVQ (R11), R12
MOVQ 24(R11), R13
MOVQ 48(R11), R14
MOVQ 72(R11), R15
MOVQ 96(R11), R11
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R11
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, AX
// Reload length to save a register
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_8x5Xor_loop:
// Load 5 outputs
// (the current output contents seed the accumulators)
VMOVDQU (R12), Y9
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (R11), Y13
// Load and process 32 bytes from input 0 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 5 outputs
// (from matrix[9] onwards the entries are not cached; each one is broadcast
// into Y15 right before it is used)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 5 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 5 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
VMOVDQU Y9, (R12)
ADDQ $0x20, R12
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R11)
ADDQ $0x20, R11
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_8x5Xor_loop
VZEROUPPER
mulAvxGFNI_8x5Xor_end:
RET
// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x6_64 combines 8 input slices into 6 output slices, 64 bytes per
// loop iteration. For each 64-byte block, output o is overwritten with the XOR
// over inputs i = 0..7 of VGF2P8AFFINEQB(in[i], matrix[i*6+o]) with a zero
// affine constant. Processing begins at byte offset start of every slice and
// covers n/64 whole blocks; the function returns without touching memory when
// n/64 is zero, and the trailing n%64 bytes are never processed.
//
// Register roles:
//   AX                                remaining block count
//   CX                                matrix base
//   Z0-Z23                            matrix[0..23], broadcast to every 64-bit lane
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R12                               base of the out slice headers
//   R13                               running byte offset into every output
//   R14                               scratch: data pointer of the output being stored
//   Z24-Z29                           accumulators for out[0..5]
//   Z30, Z31                          current input block, scratch product
TEXT ·mulGFNI_8x6_64(SB), $0-88
// Loading 24 of 48 tables to registers
// Destination kept on stack
// (the output data pointers are re-read from the out slice headers on every
// iteration instead of being held in registers)
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
SHRQ $0x06, AX
JZ mulGFNI_8x6_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// DX is overwritten last because it holds the header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x6_64_loop:
// Load and process 64 bytes from input 0 to 6 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// (matrix[24..47] are not cached in registers; they are read from memory
// with an embedded broadcast)
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 6 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
// (each output data pointer is fetched from its slice header, then indexed by R13)
MOVQ (R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x6_64_loop
VZEROUPPER
mulGFNI_8x6_64_end:
RET
// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x6 combines 8 input slices into 6 output slices, 32 bytes per
// loop iteration. For each 32-byte block, output o is overwritten with the XOR
// over inputs i = 0..7 of VGF2P8AFFINEQB(in[i], matrix[i*6+o]) with a zero
// affine constant. Processing begins at byte offset start of every slice and
// covers n/32 whole blocks; the function returns without touching memory when
// n/32 is zero, and the trailing n%32 bytes are never processed.
//
// Register roles:
//   AX                                remaining block count
//   CX                                matrix base
//   Y0-Y7                             matrix[0..7], broadcast to every 64-bit lane
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0..7]
//   R12                               base of the out slice headers
//   R13                               running byte offset into every output
//   R14                               scratch: data pointer of the output being stored
//   Y8-Y13                            accumulators for out[0..5]
//   Y14                               current input block
//   Y15                               scratch: uncached matrix entry, then product
TEXT ·mulAvxGFNI_8x6(SB), $0-88
// Loading 8 of 48 tables to registers
// Destination kept on stack
// (the output data pointers are re-read from the out slice headers on every
// iteration instead of being held in registers)
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x6_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// DX is overwritten last because it holds the header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x6_loop:
// Load and process 32 bytes from input 0 to 6 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
// Load and process 32 bytes from input 1 to 6 outputs
// (from matrix[8] onwards the entries are not cached; each one is broadcast
// into Y15 right before it is used)
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 6 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
// (each output data pointer is fetched from its slice header, then indexed by R13)
MOVQ (R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x6_loop
VZEROUPPER
mulAvxGFNI_8x6_end:
RET
// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x6_64Xor reads 8 inputs and XORs their transformed bytes into the
// current contents of 6 outputs, 64 bytes per iteration. For each block:
//   out[o] ^= T(matrix[i*6+o], in[i])   for i = 0..7, o = 0..5
// where T is the byte-wise bit-matrix transform done by VGF2P8AFFINEQB with a
// zero constant. The loop runs n>>6 times; the routine returns at once when
// that count is zero, and any remainder of n below 64 bytes is not processed.
//
// Register roles:
//   AX       remaining 64-byte iterations
//   CX       matrix base (8-byte entries)
//   Z0-Z23   matrix entries 0-23, broadcast once before the loop
//            (entries 24-47 are broadcast from memory inside the loop)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            data pointers of in[0]..in[7], advanced by start, then by 64/iteration
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output; starts at start, +64/iteration
//   R14      scratch: data pointer of the output being loaded or stored
//   Z24-Z29  accumulators for outputs 0-5
//   Z30      current input block, Z31 scratch product
//
// NOTE: this file is generated by gen.go; the redundant second load of
// out_base that the generator emits has been dropped here and should also be
// fixed in the generator, otherwise it returns on regeneration.
TEXT ·mulGFNI_8x6_64Xor(SB), $0-88
// Loading 24 of 48 tables to registers
// Destination kept on stack
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_8x6_64Xor_end
// Matrix entries for inputs 0-3 (6 entries per input) stay resident in Z0-Z23
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Fetch the data pointer of each input from its slice header.
// DX holds the header base, so it is overwritten last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x6_64Xor_loop:
// Load 6 outputs
// Output pointers are re-read from the slice headers each iteration; R13 is the offset.
MOVQ (R12), R14
VMOVDQU64 (R14)(R13*1), Z24
MOVQ 24(R12), R14
VMOVDQU64 (R14)(R13*1), Z25
MOVQ 48(R12), R14
VMOVDQU64 (R14)(R13*1), Z26
MOVQ 72(R12), R14
VMOVDQU64 (R14)(R13*1), Z27
MOVQ 96(R12), R14
VMOVDQU64 (R14)(R13*1), Z28
MOVQ 120(R12), R14
VMOVDQU64 (R14)(R13*1), Z29
// Load and process 64 bytes from input 0 to 6 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// From here on the matrix entries are broadcast straight from memory (.BCST)
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 6 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
MOVQ (R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x6_64Xor_loop
// Clear upper vector state before returning (skipped on the early-exit path, which touched no vector register)
VZEROUPPER
mulGFNI_8x6_64Xor_end:
RET
// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x6Xor reads 8 inputs and XORs their transformed bytes into the
// current contents of 6 outputs, 32 bytes per iteration. For each block:
//   out[o] ^= T(matrix[i*6+o], in[i])   for i = 0..7, o = 0..5
// where T is the byte-wise bit-matrix transform done by VGF2P8AFFINEQB with a
// zero constant. The loop runs n>>5 times; the routine returns at once when
// that count is zero, and any remainder of n below 32 bytes is not processed.
//
// Register roles:
//   AX       remaining 32-byte iterations
//   CX       matrix base (8-byte entries)
//   Y0-Y7    matrix entries 0-7, broadcast once before the loop
//            (entries 8-47 are broadcast into Y15 inside the loop)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            data pointers of in[0]..in[7], advanced by start, then by 32/iteration
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output; starts at start, +32/iteration
//   R14      scratch: data pointer of the output being loaded or stored
//   Y8-Y13   accumulators for outputs 0-5
//   Y14      current input block
//   Y15      scratch: broadcast matrix entry, then the product
//
// NOTE: this file is generated by gen.go; the redundant second load of
// out_base that the generator emits has been dropped here and should also be
// fixed in the generator, otherwise it returns on regeneration.
TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88
// Loading 8 of 48 tables to registers
// Destination kept on stack
// Full registers estimated 56 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_8x6Xor_end
// Only the first 8 matrix entries stay resident (all of input 0, two of input 1)
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Fetch the data pointer of each input from its slice header.
// DX holds the header base, so it is overwritten last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x6Xor_loop:
// Load 6 outputs
// Output pointers are re-read from the slice headers each iteration; R13 is the offset.
MOVQ (R12), R14
VMOVDQU (R14)(R13*1), Y8
MOVQ 24(R12), R14
VMOVDQU (R14)(R13*1), Y9
MOVQ 48(R12), R14
VMOVDQU (R14)(R13*1), Y10
MOVQ 72(R12), R14
VMOVDQU (R14)(R13*1), Y11
MOVQ 96(R12), R14
VMOVDQU (R14)(R13*1), Y12
MOVQ 120(R12), R14
VMOVDQU (R14)(R13*1), Y13
// Load and process 32 bytes from input 0 to 6 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// Remaining matrix entries are broadcast into Y15 just before use
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 6 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
MOVQ (R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x6Xor_loop
// Clear upper vector state before returning (skipped on the early-exit path, which touched no vector register)
VZEROUPPER
mulAvxGFNI_8x6Xor_end:
RET
// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x7_64 reads 8 inputs and overwrites 7 outputs with the XOR of their
// transformed bytes, 64 bytes per iteration. For each block:
//   out[o] = XOR over i = 0..7 of T(matrix[i*7+o], in[i])   for o = 0..6
// where T is the byte-wise bit-matrix transform done by VGF2P8AFFINEQB with a
// zero constant. Previous output contents are not read. The loop runs n>>6
// times; the routine returns at once when that count is zero, and any
// remainder of n below 64 bytes is not processed.
//
// Register roles:
//   AX       remaining 64-byte iterations
//   CX       matrix base (8-byte entries)
//   Z0-Z22   matrix entries 0-22, broadcast once before the loop
//            (entries 23-55 are broadcast from memory inside the loop)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            data pointers of in[0]..in[7], advanced by start, then by 64/iteration
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output; starts at start, +64/iteration
//   R14      scratch: data pointer of the output being stored
//   Z23-Z29  accumulators for outputs 0-6
//   Z30      current input block, Z31 scratch product
//
// NOTE: this file is generated by gen.go; the redundant second load of
// out_base that the generator emits has been dropped here and should also be
// fixed in the generator, otherwise it returns on regeneration.
TEXT ·mulGFNI_8x7_64(SB), $0-88
// Loading 23 of 56 tables to registers
// Destination kept on stack
// Full registers estimated 65 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_8x7_64_end
// Matrix entries for inputs 0-2 and the first two of input 3 stay resident in Z0-Z22
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Fetch the data pointer of each input from its slice header.
// DX holds the header base, so it is overwritten last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// Input 0 writes the accumulators directly, so no zeroing or output load is needed
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
// From here on the matrix entries are broadcast straight from memory (.BCST)
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 7 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
// Output pointers are re-read from the slice headers each iteration; R13 is the offset.
MOVQ (R12), R14
VMOVDQU64 Z23, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x7_64_loop
// Clear upper vector state before returning (skipped on the early-exit path, which touched no vector register)
VZEROUPPER
mulGFNI_8x7_64_end:
RET
// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x7 reads 8 inputs and overwrites 7 outputs with the XOR of their
// transformed bytes, 32 bytes per iteration. For each block:
//   out[o] = XOR over i = 0..7 of T(matrix[i*7+o], in[i])   for o = 0..6
// where T is the byte-wise bit-matrix transform done by VGF2P8AFFINEQB with a
// zero constant. Previous output contents are not read. The loop runs n>>5
// times; the routine returns at once when that count is zero, and any
// remainder of n below 32 bytes is not processed.
//
// Register roles:
//   AX       remaining 32-byte iterations
//   CX       matrix base (8-byte entries)
//   Y0-Y6    matrix entries 0-6 (all of input 0), broadcast once before the loop
//            (entries 7-55 are broadcast into Y15 inside the loop)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            data pointers of in[0]..in[7], advanced by start, then by 32/iteration
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output; starts at start, +32/iteration
//   R14      scratch: data pointer of the output being stored
//   Y7-Y13   accumulators for outputs 0-6
//   Y14      current input block
//   Y15      scratch: broadcast matrix entry, then the product
//
// NOTE: this file is generated by gen.go; the redundant second load of
// out_base that the generator emits has been dropped here and should also be
// fixed in the generator, otherwise it returns on regeneration.
TEXT ·mulAvxGFNI_8x7(SB), $0-88
// Loading 7 of 56 tables to registers
// Destination kept on stack
// Full registers estimated 65 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 32-byte blocks; nothing to do when it is zero
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_8x7_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Fetch the data pointer of each input from its slice header.
// DX holds the header base, so it is overwritten last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x7_loop:
// Load and process 32 bytes from input 0 to 7 outputs
// Input 0 writes the accumulators directly, so no zeroing or output load is needed
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// Remaining matrix entries are broadcast into Y15 just before use
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 7 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
// Output pointers are re-read from the slice headers each iteration; R13 is the offset.
MOVQ (R12), R14
VMOVDQU Y7, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x7_loop
// Clear upper vector state before returning (skipped on the early-exit path, which touched no vector register)
VZEROUPPER
mulAvxGFNI_8x7_end:
RET
// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x7_64Xor reads 8 inputs and XORs their transformed bytes into the
// current contents of 7 outputs, 64 bytes per iteration. For each block:
//   out[o] ^= T(matrix[i*7+o], in[i])   for i = 0..7, o = 0..6
// where T is the byte-wise bit-matrix transform done by VGF2P8AFFINEQB with a
// zero constant. The loop runs n>>6 times; the routine returns at once when
// that count is zero, and any remainder of n below 64 bytes is not processed.
//
// Register roles:
//   AX       remaining 64-byte iterations
//   CX       matrix base (8-byte entries)
//   Z0-Z22   matrix entries 0-22, broadcast once before the loop
//            (entries 23-55 are broadcast from memory inside the loop)
//   BX, SI, DI, R8, R9, R10, R11, DX
//            data pointers of in[0]..in[7], advanced by start, then by 64/iteration
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output; starts at start, +64/iteration
//   R14      scratch: data pointer of the output being loaded or stored
//   Z23-Z29  accumulators for outputs 0-6
//   Z30      current input block, Z31 scratch product
//
// NOTE: this file is generated by gen.go; the redundant second load of
// out_base that the generator emits has been dropped here and should also be
// fixed in the generator, otherwise it returns on regeneration.
TEXT ·mulGFNI_8x7_64Xor(SB), $0-88
// Loading 23 of 56 tables to registers
// Destination kept on stack
// Full registers estimated 65 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of whole 64-byte blocks; nothing to do when it is zero
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_8x7_64Xor_end
// Matrix entries for inputs 0-2 and the first two of input 3 stay resident in Z0-Z22
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Fetch the data pointer of each input from its slice header.
// DX holds the header base, so it is overwritten last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x7_64Xor_loop:
// Load 7 outputs
// Output pointers are re-read from the slice headers each iteration; R13 is the offset.
MOVQ (R12), R14
VMOVDQU64 (R14)(R13*1), Z23
MOVQ 24(R12), R14
VMOVDQU64 (R14)(R13*1), Z24
MOVQ 48(R12), R14
VMOVDQU64 (R14)(R13*1), Z25
MOVQ 72(R12), R14
VMOVDQU64 (R14)(R13*1), Z26
MOVQ 96(R12), R14
VMOVDQU64 (R14)(R13*1), Z27
MOVQ 120(R12), R14
VMOVDQU64 (R14)(R13*1), Z28
MOVQ 144(R12), R14
VMOVDQU64 (R14)(R13*1), Z29
// Load and process 64 bytes from input 0 to 7 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
// From here on the matrix entries are broadcast straight from memory (.BCST)
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 7 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
MOVQ (R12), R14
VMOVDQU64 Z23, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x7_64Xor_loop
// Clear upper vector state before returning (skipped on the early-exit path, which touched no vector register)
VZEROUPPER
mulGFNI_8x7_64Xor_end:
RET
// mulAvxGFNI_8x7Xor applies an 8-input x 7-output table matrix to in and XORs
// the result into the existing contents of out, 32 bytes per input per loop.
// For every output j (0..6) and every 32-byte block at byte offset off:
//   out[j][off:off+32] ^= XOR over i (0..7) of affine(matrix[i*7+j], in[i][off:off+32])
// where off begins at start and affine is VGF2P8AFFINEQB with imm8 = 0.
// Only n>>5 whole blocks are processed; when that count is zero the function
// returns before loading or storing any slice data.
// NOTE(review): each matrix entry is presumably the 8x8 bit-matrix form of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register use:
//   AX      remaining 32-byte blocks
//   CX      matrix base
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0]..in[7]
//   R12     base of the out slice headers (24 bytes apart)
//   R13     byte offset applied to every output, initialised to start
//   R14     scratch: data pointer of the output being loaded or stored
//   Y0-Y6   tables matrix[0..6] (input 0), broadcast once before the loop
//   Y7-Y13  accumulators for outputs 0..6
//   Y14     current input block
//   Y15     scratch: broadcast table, then the affine product
// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88
// Loading 7 of 56 tables to registers
// Destination kept on stack
// Full registers estimated 65 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; leave early when there are none.
// SHRQ has already set ZF from its result, so the TESTQ only repeats that check.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_8x7Xor_end
// Broadcast matrix[0..6] (the tables for input 0) into Y0-Y6.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Fetch the data pointers of in[0]..in[7]; the slice headers are 24 bytes apart.
// DX is overwritten last because it also holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// NOTE(review): out_base is loaded into R12 twice; the second load is redundant
// but harmless. It is emitted by the generator, so a fix belongs in gen.go.
MOVQ out_base+48(FP), R12
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x7Xor_loop:
// Load 7 outputs
// The output data pointers are re-read from the out slice headers on every
// iteration (via R14) instead of being cached in registers.
MOVQ (R12), R14
VMOVDQU (R14)(R13*1), Y7
MOVQ 24(R12), R14
VMOVDQU (R14)(R13*1), Y8
MOVQ 48(R12), R14
VMOVDQU (R14)(R13*1), Y9
MOVQ 72(R12), R14
VMOVDQU (R14)(R13*1), Y10
MOVQ 96(R12), R14
VMOVDQU (R14)(R13*1), Y11
MOVQ 120(R12), R14
VMOVDQU (R14)(R13*1), Y12
MOVQ 144(R12), R14
VMOVDQU (R14)(R13*1), Y13
// Load and process 32 bytes from input 0 to 7 outputs
// Input 0 uses the tables preloaded in Y0-Y6.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// From here on each table is broadcast from memory into Y15 right before use,
// and Y15 is then overwritten with the product.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 7 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
MOVQ (R12), R14
VMOVDQU Y7, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
// Move the shared output offset to the next block and count this block down.
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x7Xor_loop
// Zero the upper vector halves before returning to avoid AVX/SSE transition
// penalties in the caller.
VZEROUPPER
// The early-exit path lands here without having written any vector register.
mulAvxGFNI_8x7Xor_end:
RET
// mulGFNI_8x8_64 applies an 8-input x 8-output table matrix to in and writes
// the result to out, 64 bytes per input per loop iteration.
// For every output j (0..7) and every 64-byte block at byte offset off:
//   out[j][off:off+64] = XOR over i (0..7) of affine(matrix[i*8+j], in[i][off:off+64])
// where off begins at start and affine is VGF2P8AFFINEQB with imm8 = 0.
// Previous contents of out are not read: input 0 initialises the accumulators.
// Only n>>6 whole blocks are processed; when that count is zero the function
// returns before loading or storing any slice data.
// NOTE(review): each matrix entry is presumably the 8x8 bit-matrix form of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0]..in[7]
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output, initialised to start
//   R14      scratch: data pointer of the output being stored
//   Z0-Z21   tables matrix[0..21], broadcast once before the loop
//   Z22-Z29  accumulators for outputs 0..7
//   Z30      current input block
//   Z31      scratch: affine product
// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_8x8_64(SB), $0-88
// Loading 22 of 64 tables to registers
// Destination kept on stack
// Full registers estimated 74 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; leave early when there are none.
// SHRQ has already set ZF from its result, so the TESTQ only repeats that check.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_8x8_64_end
// Broadcast matrix[0..21] into Z0-Z21: all tables for inputs 0 and 1 and the
// first six tables for input 2.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Fetch the data pointers of in[0]..in[7]; the slice headers are 24 bytes apart.
// DX is overwritten last because it also holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// NOTE(review): out_base is loaded into R12 twice; the second load is redundant
// but harmless. It is emitted by the generator, so a fix belongs in gen.go.
MOVQ out_base+48(FP), R12
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x8_64_loop:
// Load and process 64 bytes from input 0 to 8 outputs
// Input 0 writes its products straight into Z22-Z29, which initialises the
// accumulators without reading out.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
// Load and process 64 bytes from input 1 to 8 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
// The preloaded tables end at Z21; every later table is read from memory
// through the instruction's embedded broadcast (.BCST) operand.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 8 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 8 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 8 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 8 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 8 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
// The output data pointers are re-read from the out slice headers on every
// iteration (via R14) instead of being cached in registers.
MOVQ (R12), R14
VMOVDQU64 Z22, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z23, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
// Move the shared output offset to the next block and count this block down.
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x8_64_loop
// Zero the upper vector halves before returning to avoid AVX/SSE transition
// penalties in the caller.
VZEROUPPER
// The early-exit path lands here without having written any vector register.
mulGFNI_8x8_64_end:
RET
// mulAvxGFNI_8x8 applies an 8-input x 8-output table matrix to in and writes
// the result to out, 32 bytes per input per loop iteration.
// For every output j (0..7) and every 32-byte block at byte offset off:
//   out[j][off:off+32] = XOR over i (0..7) of affine(matrix[i*8+j], in[i][off:off+32])
// where off begins at start and affine is VGF2P8AFFINEQB with imm8 = 0.
// Previous contents of out are not read: input 0 initialises the accumulators.
// Only n>>5 whole blocks are processed; when that count is zero the function
// returns before loading or storing any slice data.
// NOTE(review): each matrix entry is presumably the 8x8 bit-matrix form of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register use:
//   AX      remaining 32-byte blocks
//   CX      matrix base
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0]..in[7]
//   R12     base of the out slice headers (24 bytes apart)
//   R13     byte offset applied to every output, initialised to start
//   R14     scratch: data pointer of the output being stored
//   Y0-Y5   tables matrix[0..5], broadcast once before the loop
//   Y6-Y13  accumulators for outputs 0..7
//   Y14     current input block
//   Y15     scratch: broadcast table, then the affine product
// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x8(SB), $0-88
// Loading 6 of 64 tables to registers
// Destination kept on stack
// Full registers estimated 74 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; leave early when there are none.
// SHRQ has already set ZF from its result, so the TESTQ only repeats that check.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_8x8_end
// Broadcast matrix[0..5] (the first six tables for input 0) into Y0-Y5.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Fetch the data pointers of in[0]..in[7]; the slice headers are 24 bytes apart.
// DX is overwritten last because it also holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// NOTE(review): out_base is loaded into R12 twice; the second load is redundant
// but harmless. It is emitted by the generator, so a fix belongs in gen.go.
MOVQ out_base+48(FP), R12
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x8_loop:
// Load and process 32 bytes from input 0 to 8 outputs
// Input 0 writes its products straight into Y6-Y13, which initialises the
// accumulators without reading out. Outputs 6 and 7 have no preloaded table,
// so their table is broadcast into the accumulator register and transformed
// in place.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
VBROADCASTSD 48(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 56(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 8 outputs
// From here on each table is broadcast from memory into Y15 right before use,
// and Y15 is then overwritten with the product.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 8 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 8 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
// The output data pointers are re-read from the out slice headers on every
// iteration (via R14) instead of being cached in registers.
MOVQ (R12), R14
VMOVDQU Y6, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y7, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
// Move the shared output offset to the next block and count this block down.
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x8_loop
// Zero the upper vector halves before returning to avoid AVX/SSE transition
// penalties in the caller.
VZEROUPPER
// The early-exit path lands here without having written any vector register.
mulAvxGFNI_8x8_end:
RET
// mulGFNI_8x8_64Xor applies an 8-input x 8-output table matrix to in and XORs
// the result into the existing contents of out, 64 bytes per input per loop.
// For every output j (0..7) and every 64-byte block at byte offset off:
//   out[j][off:off+64] ^= XOR over i (0..7) of affine(matrix[i*8+j], in[i][off:off+64])
// where off begins at start and affine is VGF2P8AFFINEQB with imm8 = 0.
// Only n>>6 whole blocks are processed; when that count is zero the function
// returns before loading or storing any slice data.
// NOTE(review): each matrix entry is presumably the 8x8 bit-matrix form of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0]..in[7]
//   R12      base of the out slice headers (24 bytes apart)
//   R13      byte offset applied to every output, initialised to start
//   R14      scratch: data pointer of the output being loaded or stored
//   Z0-Z21   tables matrix[0..21], broadcast once before the loop
//   Z22-Z29  accumulators for outputs 0..7
//   Z30      current input block
//   Z31      scratch: affine product
// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_8x8_64Xor(SB), $0-88
// Loading 22 of 64 tables to registers
// Destination kept on stack
// Full registers estimated 74 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of whole 64-byte blocks; leave early when there are none.
// SHRQ has already set ZF from its result, so the TESTQ only repeats that check.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_8x8_64Xor_end
// Broadcast matrix[0..21] into Z0-Z21: all tables for inputs 0 and 1 and the
// first six tables for input 2.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Fetch the data pointers of in[0]..in[7]; the slice headers are 24 bytes apart.
// DX is overwritten last because it also holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// NOTE(review): out_base is loaded into R12 twice; the second load is redundant
// but harmless. It is emitted by the generator, so a fix belongs in gen.go.
MOVQ out_base+48(FP), R12
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x8_64Xor_loop:
// Load 8 outputs
// The output data pointers are re-read from the out slice headers on every
// iteration (via R14) instead of being cached in registers.
MOVQ (R12), R14
VMOVDQU64 (R14)(R13*1), Z22
MOVQ 24(R12), R14
VMOVDQU64 (R14)(R13*1), Z23
MOVQ 48(R12), R14
VMOVDQU64 (R14)(R13*1), Z24
MOVQ 72(R12), R14
VMOVDQU64 (R14)(R13*1), Z25
MOVQ 96(R12), R14
VMOVDQU64 (R14)(R13*1), Z26
MOVQ 120(R12), R14
VMOVDQU64 (R14)(R13*1), Z27
MOVQ 144(R12), R14
VMOVDQU64 (R14)(R13*1), Z28
MOVQ 168(R12), R14
VMOVDQU64 (R14)(R13*1), Z29
// Load and process 64 bytes from input 0 to 8 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 8 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
// The preloaded tables end at Z21; every later table is read from memory
// through the instruction's embedded broadcast (.BCST) operand.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 8 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 8 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 8 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 8 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 8 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
MOVQ (R12), R14
VMOVDQU64 Z22, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z23, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
// Move the shared output offset to the next block and count this block down.
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x8_64Xor_loop
// Zero the upper vector halves before returning to avoid AVX/SSE transition
// penalties in the caller.
VZEROUPPER
// The early-exit path lands here without having written any vector register.
mulGFNI_8x8_64Xor_end:
RET
// mulAvxGFNI_8x8Xor applies an 8-input x 8-output table matrix to in and XORs
// the result into the existing contents of out, 32 bytes per input per loop.
// For every output j (0..7) and every 32-byte block at byte offset off:
//   out[j][off:off+32] ^= XOR over i (0..7) of affine(matrix[i*8+j], in[i][off:off+32])
// where off begins at start and affine is VGF2P8AFFINEQB with imm8 = 0.
// Only n>>5 whole blocks are processed; when that count is zero the function
// returns before loading or storing any slice data.
// NOTE(review): each matrix entry is presumably the 8x8 bit-matrix form of a
// GF(2^8) coefficient - confirm against the Go caller.
//
// Register use:
//   AX      remaining 32-byte blocks
//   CX      matrix base
//   BX, SI, DI, R8, R9, R10, R11, DX  read cursors for in[0]..in[7]
//   R12     base of the out slice headers (24 bytes apart)
//   R13     byte offset applied to every output, initialised to start
//   R14     scratch: data pointer of the output being loaded or stored
//   Y0-Y5   tables matrix[0..5], broadcast once before the loop
//   Y6-Y13  accumulators for outputs 0..7
//   Y14     current input block
//   Y15     scratch: broadcast table, then the affine product
// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88
// Loading 6 of 64 tables to registers
// Destination kept on stack
// Full registers estimated 74 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of whole 32-byte blocks; leave early when there are none.
// SHRQ has already set ZF from its result, so the TESTQ only repeats that check.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_8x8Xor_end
// Broadcast matrix[0..5] (the first six tables for input 0) into Y0-Y5.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Fetch the data pointers of in[0]..in[7]; the slice headers are 24 bytes apart.
// DX is overwritten last because it also holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// NOTE(review): out_base is loaded into R12 twice; the second load is redundant
// but harmless. It is emitted by the generator, so a fix belongs in gen.go.
MOVQ out_base+48(FP), R12
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x8Xor_loop:
// Load 8 outputs
// The output data pointers are re-read from the out slice headers on every
// iteration (via R14) instead of being cached in registers.
MOVQ (R12), R14
VMOVDQU (R14)(R13*1), Y6
MOVQ 24(R12), R14
VMOVDQU (R14)(R13*1), Y7
MOVQ 48(R12), R14
VMOVDQU (R14)(R13*1), Y8
MOVQ 72(R12), R14
VMOVDQU (R14)(R13*1), Y9
MOVQ 96(R12), R14
VMOVDQU (R14)(R13*1), Y10
MOVQ 120(R12), R14
VMOVDQU (R14)(R13*1), Y11
MOVQ 144(R12), R14
VMOVDQU (R14)(R13*1), Y12
MOVQ 168(R12), R14
VMOVDQU (R14)(R13*1), Y13
// Load and process 32 bytes from input 0 to 8 outputs
// Outputs 0..5 use the tables preloaded in Y0-Y5; from output 6 onward each
// table is broadcast from memory into Y15 right before use, and Y15 is then
// overwritten with the product.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 8 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 8 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 8 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
MOVQ (R12), R14
VMOVDQU Y6, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y7, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
// Move the shared output offset to the next block and count this block down.
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x8Xor_loop
// Zero the upper vector halves before returning to avoid AVX/SSE transition
// penalties in the caller.
VZEROUPPER
// The early-exit path lands here without having written any vector register.
mulAvxGFNI_8x8Xor_end:
RET
// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x9_64 combines 8 input slices into 9 output slices, 64 bytes per
// loop iteration, overwriting the outputs.
//
// For every 64-byte block, output j becomes the XOR over inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*9+j]). Each matrix entry is one 64-bit 8x8
// bit-matrix (presumably encoding the GF(2^8) coefficient for that pair).
// Only n/64 whole blocks are processed: nothing happens when n < 64, and any
// n%64 tail bytes are left untouched. Reads begin at byte offset start of
// each input; writes begin at the same offset of each output.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX,SI,DI,R8-R11,DX      read cursors for in[0]..in[7]
//   R12                     base of the out slice-header array (24 bytes each)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output being stored
//   Z0-Z20                  matrix entries 0..20, broadcast once up front
//   Z21-Z29                 accumulators for outputs 0..8
//   Z30 / Z31               current input block / scratch product
TEXT ·mulGFNI_8x9_64(SB), $0-88
	// Loading 21 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 83 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x9_64_end

	// Broadcast the first 21 matrix entries to every qword lane.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the in slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (first input: products initialise the accumulators directly)
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (matrix entries beyond index 20 are broadcast from memory via .BCST)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	// (each output's data pointer is re-read from its slice header)
	MOVQ      (R12), R14
	VMOVDQU64 Z21, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      144(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      168(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      192(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x9_64_loop
	VZEROUPPER

mulGFNI_8x9_64_end:
	RET
// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x9 combines 8 input slices into 9 output slices, 32 bytes per
// loop iteration, overwriting the outputs.
//
// For every 32-byte block, output j becomes the XOR over inputs i of
// VGF2P8AFFINEQB(in[i], matrix[i*9+j]). Each matrix entry is one 64-bit 8x8
// bit-matrix (presumably encoding the GF(2^8) coefficient for that pair).
// Only n/32 whole blocks are processed: nothing happens when n < 32, and any
// n%32 tail bytes are left untouched. Reads begin at byte offset start of
// each input; writes begin at the same offset of each output.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX,SI,DI,R8-R11,DX      read cursors for in[0]..in[7]
//   R12                     base of the out slice-header array (24 bytes each)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output being stored
//   Y0-Y4                   matrix entries 0..4, broadcast once up front
//   Y5-Y13                  accumulators for outputs 0..8
//   Y14 / Y15               current input block / scratch matrix entry+product
TEXT ·mulAvxGFNI_8x9(SB), $0-88
	// Loading 5 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 83 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x9_end

	// Only the first five matrix entries stay resident; the rest are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the in slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (first input: products initialise the accumulators directly)
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD   40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD   48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD   56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD   64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	// (each output's data pointer is re-read from its slice header)
	MOVQ    (R12), R14
	VMOVDQU Y5, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    144(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    168(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    192(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x9_loop
	VZEROUPPER

mulAvxGFNI_8x9_end:
	RET
// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_8x9_64Xor combines 8 input slices into 9 output slices, 64 bytes
// per loop iteration, XOR-ing the result into the existing output contents.
//
// For every 64-byte block, output j becomes out[j] XOR the XOR over inputs i
// of VGF2P8AFFINEQB(in[i], matrix[i*9+j]). Each matrix entry is one 64-bit
// 8x8 bit-matrix (presumably encoding the GF(2^8) coefficient for that pair).
// Only n/64 whole blocks are processed: nothing happens when n < 64, and any
// n%64 tail bytes are left untouched. Reads begin at byte offset start of
// each input; outputs are read and written from the same offset.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX,SI,DI,R8-R11,DX      read cursors for in[0]..in[7]
//   R12                     base of the out slice-header array (24 bytes each)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output in flight
//   Z0-Z20                  matrix entries 0..20, broadcast once up front
//   Z21-Z29                 accumulators for outputs 0..8
//   Z30 / Z31               current input block / scratch product
TEXT ·mulGFNI_8x9_64Xor(SB), $0-88
	// Loading 21 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 83 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x06, AX
	JZ   mulGFNI_8x9_64Xor_end

	// Broadcast the first 21 matrix entries to every qword lane.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the in slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x9_64Xor_loop:
	// Load 9 outputs
	// (the existing output bytes seed the accumulators)
	MOVQ      (R12), R14
	VMOVDQU64 (R14)(R13*1), Z21
	MOVQ      24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z22
	MOVQ      48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z23
	MOVQ      72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ      96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ      120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ      144(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ      168(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ      192(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (matrix entries beyond index 20 are broadcast from memory via .BCST)
	VMOVDQU64           (DI), Z30
	ADDQ                $0x40, DI
	VGF2P8AFFINEQB      $0x00, Z18, Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB      $0x00, Z19, Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB      $0x00, Z20, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64           (R9), Z30
	ADDQ                $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64           (R10), Z30
	ADDQ                $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64           (R11), Z30
	ADDQ                $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64           (DX), Z30
	ADDQ                $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD              Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD              Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Store 9 outputs
	// (each output's data pointer is re-read from its slice header)
	MOVQ      (R12), R14
	VMOVDQU64 Z21, (R14)(R13*1)
	MOVQ      24(R12), R14
	VMOVDQU64 Z22, (R14)(R13*1)
	MOVQ      48(R12), R14
	VMOVDQU64 Z23, (R14)(R13*1)
	MOVQ      72(R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ      96(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ      120(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ      144(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ      168(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ      192(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ  mulGFNI_8x9_64Xor_loop
	VZEROUPPER

mulGFNI_8x9_64Xor_end:
	RET
// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_8x9Xor combines 8 input slices into 9 output slices, 32 bytes
// per loop iteration, XOR-ing the result into the existing output contents.
//
// For every 32-byte block, output j becomes out[j] XOR the XOR over inputs i
// of VGF2P8AFFINEQB(in[i], matrix[i*9+j]). Each matrix entry is one 64-bit
// 8x8 bit-matrix (presumably encoding the GF(2^8) coefficient for that pair).
// Only n/32 whole blocks are processed: nothing happens when n < 32, and any
// n%32 tail bytes are left untouched. Reads begin at byte offset start of
// each input; outputs are read and written from the same offset.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX,SI,DI,R8-R11,DX      read cursors for in[0]..in[7]
//   R12                     base of the out slice-header array (24 bytes each)
//   R13                     current byte offset into every output
//   R14                     scratch: data pointer of the output in flight
//   Y0-Y4                   matrix entries 0..4, broadcast once up front
//   Y5-Y13                  accumulators for outputs 0..8
//   Y14 / Y15               current input block / scratch matrix entry+product
TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88
	// Loading 5 of 72 tables to registers
	// Destination kept on stack
	// Full registers estimated 83 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// SHRQ sets ZF from its result, so the zero-block check needs no TESTQ.
	SHRQ $0x05, AX
	JZ   mulAvxGFNI_8x9Xor_end

	// Only the first five matrix entries stay resident; the rest are
	// re-broadcast from memory inside the loop.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice; DX is overwritten last
	// because it also holds the base of the in slice-header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x9Xor_loop:
	// Load 9 outputs
	// (the existing output bytes seed the accumulators)
	MOVQ    (R12), R14
	VMOVDQU (R14)(R13*1), Y5
	MOVQ    24(R12), R14
	VMOVDQU (R14)(R13*1), Y6
	MOVQ    48(R12), R14
	VMOVDQU (R14)(R13*1), Y7
	MOVQ    72(R12), R14
	VMOVDQU (R14)(R13*1), Y8
	MOVQ    96(R12), R14
	VMOVDQU (R14)(R13*1), Y9
	MOVQ    120(R12), R14
	VMOVDQU (R14)(R13*1), Y10
	MOVQ    144(R12), R14
	VMOVDQU (R14)(R13*1), Y11
	MOVQ    168(R12), R14
	VMOVDQU (R14)(R13*1), Y12
	MOVQ    192(R12), R14
	VMOVDQU (R14)(R13*1), Y13

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU        (BX), Y14
	ADDQ           $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (SI), Y14
	ADDQ           $0x20, SI
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU        (DI), Y14
	ADDQ           $0x20, DI
	VBROADCASTSD   144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU        (R8), Y14
	ADDQ           $0x20, R8
	VBROADCASTSD   216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU        (R9), Y14
	ADDQ           $0x20, R9
	VBROADCASTSD   288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU        (R10), Y14
	ADDQ           $0x20, R10
	VBROADCASTSD   360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU        (R11), Y14
	ADDQ           $0x20, R11
	VBROADCASTSD   432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	// (each output's data pointer is re-read from its slice header)
	MOVQ    (R12), R14
	VMOVDQU Y5, (R14)(R13*1)
	MOVQ    24(R12), R14
	VMOVDQU Y6, (R14)(R13*1)
	MOVQ    48(R12), R14
	VMOVDQU Y7, (R14)(R13*1)
	MOVQ    72(R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ    96(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ    120(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ    144(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ    168(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ    192(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ  mulAvxGFNI_8x9Xor_loop
	VZEROUPPER

mulAvxGFNI_8x9Xor_end:
	RET
// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Produces 10 outputs from 8 inputs, 64 bytes per loop iteration.
// For every output o, the block stored to out[o] is the XOR over inputs i of
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+o]) with constant term 0x00.
// The previous contents of out are overwritten, not accumulated.
// Only n>>6 whole 64-byte blocks are processed, beginning at byte offset
// start in every input and output; an n%64 tail is left untouched.
//
// Register roles:
//   AX                                remaining 64-byte blocks
//   CX                                matrix base
//   Z0-Z19                            matrix[0..19] broadcast (inputs 0 and 1);
//                                     the other 60 entries are broadcast from
//                                     memory at the point of use
//   BX, SI, DI, R8, R9, R10, R11, DX  read pointers for in[0]..in[7]
//   R12                               base of the out slice headers (24 bytes each)
//   R13                               byte offset into every output
//   R14                               scratch: data pointer of the output being stored
//   Z20-Z29                           accumulators for outputs 0..9
//   Z30 / Z31                         current input block / temporary product
TEXT ·mulGFNI_8x10_64(SB), $0-88
// Loading 20 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 92 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow
// directly without a separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_8x10_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// DX is overwritten last because it holds the header base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// Output data pointers are re-read from the slice headers on every store.
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// (the first input initialises the accumulators, so no XOR is needed)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (from here on the matrix entries are broadcast straight from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R12), R14
VMOVDQU64 Z20, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z21, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z22, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z23, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 192(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 216(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x10_64_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulGFNI_8x10_64_end:
RET
// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Produces 10 outputs from 8 inputs, 32 bytes per loop iteration.
// For every output o, the block stored to out[o] is the XOR over inputs i of
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+o]) with constant term 0x00.
// The previous contents of out are overwritten, not accumulated.
// Only n>>5 whole 32-byte blocks are processed, beginning at byte offset
// start in every input and output; an n%32 tail is left untouched.
//
// Register roles:
//   AX                                remaining 32-byte blocks
//   CX                                matrix base
//   Y0-Y3                             matrix[0..3] broadcast; the other 76
//                                     entries are broadcast from memory on use
//   BX, SI, DI, R8, R9, R10, R11, DX  read pointers for in[0]..in[7]
//   R12                               base of the out slice headers (24 bytes each)
//   R13                               byte offset into every output
//   R14                               scratch: data pointer of the output being stored
//   Y4-Y13                            accumulators for outputs 0..9
//   Y14 / Y15                         current input block / matrix entry and product
TEXT ·mulAvxGFNI_8x10(SB), $0-88
// Loading 4 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 92 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow
// directly without a separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x10_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// DX is overwritten last because it holds the header base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// Output data pointers are re-read from the slice headers on every store.
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// (the first input initialises the accumulators, so no XOR is needed;
// outputs 4..9 broadcast their matrix entry into the accumulator itself)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R12), R14
VMOVDQU Y4, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y5, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y6, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y7, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 192(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 216(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x10_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulAvxGFNI_8x10_end:
RET
// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: 10 outputs from 8 inputs, 64 bytes per loop iteration.
// For every output o, the existing block of out[o] is loaded, XORed with
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+o]) (constant term 0x00) for each
// input i, and written back.
// Only n>>6 whole 64-byte blocks are processed, beginning at byte offset
// start in every input and output; an n%64 tail is left untouched.
//
// Register roles:
//   AX                                remaining 64-byte blocks
//   CX                                matrix base
//   Z0-Z19                            matrix[0..19] broadcast (inputs 0 and 1);
//                                     the other 60 entries are broadcast from
//                                     memory at the point of use
//   BX, SI, DI, R8, R9, R10, R11, DX  read pointers for in[0]..in[7]
//   R12                               base of the out slice headers (24 bytes each)
//   R13                               byte offset into every output
//   R14                               scratch: data pointer of the output in use
//   Z20-Z29                           accumulators for outputs 0..9
//   Z30 / Z31                         current input block / temporary product
TEXT ·mulGFNI_8x10_64Xor(SB), $0-88
// Loading 20 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 92 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow
// directly without a separate TESTQ.
SHRQ $0x06, AX
JZ mulGFNI_8x10_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// DX is overwritten last because it holds the header base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// Output data pointers are re-read from the slice headers on every load/store.
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulGFNI_8x10_64Xor_loop:
// Load 10 outputs
MOVQ (R12), R14
VMOVDQU64 (R14)(R13*1), Z20
MOVQ 24(R12), R14
VMOVDQU64 (R14)(R13*1), Z21
MOVQ 48(R12), R14
VMOVDQU64 (R14)(R13*1), Z22
MOVQ 72(R12), R14
VMOVDQU64 (R14)(R13*1), Z23
MOVQ 96(R12), R14
VMOVDQU64 (R14)(R13*1), Z24
MOVQ 120(R12), R14
VMOVDQU64 (R14)(R13*1), Z25
MOVQ 144(R12), R14
VMOVDQU64 (R14)(R13*1), Z26
MOVQ 168(R12), R14
VMOVDQU64 (R14)(R13*1), Z27
MOVQ 192(R12), R14
VMOVDQU64 (R14)(R13*1), Z28
MOVQ 216(R12), R14
VMOVDQU64 (R14)(R13*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (from here on the matrix entries are broadcast straight from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R12), R14
VMOVDQU64 Z20, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU64 Z21, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU64 Z22, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU64 Z23, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU64 Z24, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU64 Z25, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU64 Z26, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU64 Z27, (R14)(R13*1)
MOVQ 192(R12), R14
VMOVDQU64 Z28, (R14)(R13*1)
MOVQ 216(R12), R14
VMOVDQU64 Z29, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x40, R13
DECQ AX
JNZ mulGFNI_8x10_64Xor_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulGFNI_8x10_64Xor_end:
RET
// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: 10 outputs from 8 inputs, 32 bytes per loop iteration.
// For every output o, the existing block of out[o] is loaded, XORed with
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+o]) (constant term 0x00) for each
// input i, and written back.
// Only n>>5 whole 32-byte blocks are processed, beginning at byte offset
// start in every input and output; an n%32 tail is left untouched.
//
// Register roles:
//   AX                                remaining 32-byte blocks
//   CX                                matrix base
//   Y0-Y3                             matrix[0..3] broadcast; the other 76
//                                     entries are broadcast from memory on use
//   BX, SI, DI, R8, R9, R10, R11, DX  read pointers for in[0]..in[7]
//   R12                               base of the out slice headers (24 bytes each)
//   R13                               byte offset into every output
//   R14                               scratch: data pointer of the output in use
//   Y4-Y13                            accumulators for outputs 0..9
//   Y14 / Y15                         current input block / matrix entry and product
TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88
// Loading 4 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 92 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ by a non-zero immediate sets ZF from its result, so JZ can follow
// directly without a separate TESTQ.
SHRQ $0x05, AX
JZ mulAvxGFNI_8x10Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
// DX is overwritten last because it holds the header base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), DX
// Output data pointers are re-read from the slice headers on every load/store.
MOVQ out_base+48(FP), R12
MOVQ start+72(FP), R13
// Add start offset to input
ADDQ R13, BX
ADDQ R13, SI
ADDQ R13, DI
ADDQ R13, R8
ADDQ R13, R9
ADDQ R13, R10
ADDQ R13, R11
ADDQ R13, DX
mulAvxGFNI_8x10Xor_loop:
// Load 10 outputs
MOVQ (R12), R14
VMOVDQU (R14)(R13*1), Y4
MOVQ 24(R12), R14
VMOVDQU (R14)(R13*1), Y5
MOVQ 48(R12), R14
VMOVDQU (R14)(R13*1), Y6
MOVQ 72(R12), R14
VMOVDQU (R14)(R13*1), Y7
MOVQ 96(R12), R14
VMOVDQU (R14)(R13*1), Y8
MOVQ 120(R12), R14
VMOVDQU (R14)(R13*1), Y9
MOVQ 144(R12), R14
VMOVDQU (R14)(R13*1), Y10
MOVQ 168(R12), R14
VMOVDQU (R14)(R13*1), Y11
MOVQ 192(R12), R14
VMOVDQU (R14)(R13*1), Y12
MOVQ 216(R12), R14
VMOVDQU (R14)(R13*1), Y13
// Load and process 32 bytes from input 0 to 10 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y4, Y15, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 32(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R12), R14
VMOVDQU Y4, (R14)(R13*1)
MOVQ 24(R12), R14
VMOVDQU Y5, (R14)(R13*1)
MOVQ 48(R12), R14
VMOVDQU Y6, (R14)(R13*1)
MOVQ 72(R12), R14
VMOVDQU Y7, (R14)(R13*1)
MOVQ 96(R12), R14
VMOVDQU Y8, (R14)(R13*1)
MOVQ 120(R12), R14
VMOVDQU Y9, (R14)(R13*1)
MOVQ 144(R12), R14
VMOVDQU Y10, (R14)(R13*1)
MOVQ 168(R12), R14
VMOVDQU Y11, (R14)(R13*1)
MOVQ 192(R12), R14
VMOVDQU Y12, (R14)(R13*1)
MOVQ 216(R12), R14
VMOVDQU Y13, (R14)(R13*1)
// Prepare for next loop
ADDQ $0x20, R13
DECQ AX
JNZ mulAvxGFNI_8x10Xor_loop
// Clear the upper vector state before returning to the caller.
VZEROUPPER
mulAvxGFNI_8x10Xor_end:
RET
// mulGFNI_9x1_64 combines 9 inputs into 1 output; the output is overwritten.
// Processes 64 bytes/loop. Only n>>6 whole blocks are handled; the function
// returns without touching memory when that count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i uses matrix[i]. The 9 per-input results are XORed together.
// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	//
	// Register roles:
	//   AX                          remaining 64-byte blocks
	//   Z0-Z8                       matrix[0..8], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R12                         write cursor for out[0]
	//   Z9 accumulator, Z10 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x1_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulGFNI_9x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	// (first input writes the accumulator directly, so no prior clear is needed)
	VMOVDQU64 (DX), Z10
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z9

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z10
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z10
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z10
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z10
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z10
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z10
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (R11), Z10
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64 (CX), Z10
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Store 1 outputs
	VMOVDQU64 Z9, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_9x1_64_loop
	VZEROUPPER

mulGFNI_9x1_64_end:
	RET
// mulAvxGFNI_9x1 combines 9 inputs into 1 output; the output is overwritten.
// Processes 32 bytes/loop. Only n>>5 whole blocks are handled; the function
// returns without touching memory when that count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i uses matrix[i]. The 9 per-input results are XORed together.
// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	//
	// Register roles:
	//   AX                          remaining 32-byte blocks
	//   Y0-Y8                       matrix[0..8], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R12                         write cursor for out[0]
	//   Y9 accumulator, Y10 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x1_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulAvxGFNI_9x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	// (first input writes the accumulator directly, so no prior clear is needed)
	VMOVDQU (DX), Y10
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y9

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y10
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y10
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y10
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y10
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y10
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y10
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (R11), Y10
	ADDQ $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU (CX), Y10
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Store 1 outputs
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x1_loop
	VZEROUPPER

mulAvxGFNI_9x1_end:
	RET
// mulGFNI_9x1_64Xor combines 9 inputs into 1 output and XORs the result into
// the existing output contents. Processes 64 bytes/loop. Only n>>6 whole
// blocks are handled; the function returns without touching memory when that
// count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i uses matrix[i].
// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	//
	// Register roles:
	//   AX                          remaining 64-byte blocks
	//   Z0-Z8                       matrix[0..8], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R12                         read/write cursor for out[0]
	//   Z9 accumulator, Z10 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulGFNI_9x1_64Xor_loop:
	// Load 1 outputs (seed the accumulator with the current output bytes)
	VMOVDQU64 (R12), Z9

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z10
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z10
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z10
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z10
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z10
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z10
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z10
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (R11), Z10
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64 (CX), Z10
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z8, Z10, Z10
	VXORPD Z9, Z10, Z9

	// Store 1 outputs
	VMOVDQU64 Z9, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_9x1_64Xor_loop
	VZEROUPPER

mulGFNI_9x1_64Xor_end:
	RET
// mulAvxGFNI_9x1Xor combines 9 inputs into 1 output and XORs the result into
// the existing output contents. Processes 32 bytes/loop. Only n>>5 whole
// blocks are handled; the function returns without touching memory when that
// count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i uses matrix[i].
// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 12 YMM used
	//
	// Register roles:
	//   AX                          remaining 32-byte blocks
	//   Y0-Y8                       matrix[0..8], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R12                         read/write cursor for out[0]
	//   Y9 accumulator, Y10 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R12
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, CX

mulAvxGFNI_9x1Xor_loop:
	// Load 1 outputs (seed the accumulator with the current output bytes)
	VMOVDQU (R12), Y9

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y10
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y10
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y10
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y10
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y10
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y10
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y10
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (R11), Y10
	ADDQ $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU (CX), Y10
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y8, Y10, Y10
	VXORPD Y9, Y10, Y9

	// Store 1 outputs
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x1Xor_loop
	VZEROUPPER

mulAvxGFNI_9x1Xor_end:
	RET
// mulGFNI_9x2_64 combines 9 inputs into 2 outputs; the outputs are
// overwritten. Processes 64 bytes/loop. Only n>>6 whole blocks are handled;
// the function returns without touching memory when that count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i / output j uses matrix[i*2+j].
// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	//
	// Register roles:
	//   AX                          remaining 64-byte blocks
	//   Z0-Z17                      matrix[0..17], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R13, R12                    write cursors for out[0], out[1]
	//   Z18, Z19 accumulators; Z20 input block; Z21 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R12
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13
	ADDQ R14, R12

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, CX

mulGFNI_9x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	// (first input writes the accumulators directly, so no prior clear is needed)
	VMOVDQU64 (DX), Z20
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z19

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z20
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z20
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z20
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z20
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z20
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (R10), Z20
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64 (R11), Z20
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64 (CX), Z20
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Store 2 outputs
	VMOVDQU64 Z18, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z19, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_9x2_64_loop
	VZEROUPPER

mulGFNI_9x2_64_end:
	RET
// mulAvxGFNI_9x2 combines 9 inputs into 2 outputs; the outputs are
// overwritten. Processes 32 bytes/loop. Only n>>5 whole blocks are handled;
// the function returns without touching memory when that count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i / output j uses matrix[i*2+j].
// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x2(SB), $0-88
	// Loading 12 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	//
	// Register roles:
	//   AX                          remaining 32-byte blocks
	//   CX                          matrix base (kept live: entries 12..17 are
	//                               re-broadcast from memory every iteration)
	//   Y0-Y11                      matrix[0..11], broadcast to every qword lane
	//   BX,SI,DI,R8-R12,DX          read cursors for in[0..8]
	//   R14, R13                    write cursors for out[0], out[1]
	//   Y12, Y13 accumulators; Y14 input block; Y15 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R13
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, DX

mulAvxGFNI_9x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	// (first input writes the accumulators directly, so no prior clear is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (from here on the tables are not resident and are broadcast into Y15)
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x2_loop
	VZEROUPPER

mulAvxGFNI_9x2_end:
	RET
// mulGFNI_9x2_64Xor combines 9 inputs into 2 outputs and XORs the results
// into the existing output contents. Processes 64 bytes/loop. Only n>>6 whole
// blocks are handled; the function returns without touching memory when that
// count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i / output j uses matrix[i*2+j].
// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	//
	// Register roles:
	//   AX                          remaining 64-byte blocks
	//   Z0-Z17                      matrix[0..17], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R13, R12                    read/write cursors for out[0], out[1]
	//   Z18, Z19 accumulators; Z20 input block; Z21 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R12
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13
	ADDQ R14, R12

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, CX

mulGFNI_9x2_64Xor_loop:
	// Load 2 outputs (seed the accumulators with the current output bytes)
	VMOVDQU64 (R13), Z18
	VMOVDQU64 (R12), Z19

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (DX), Z20
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z1, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z20
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z3, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z20
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z5, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z20
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z7, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z20
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z9, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z20
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z11, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (R10), Z20
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z13, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64 (R11), Z20
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z15, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64 (CX), Z20
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z16, Z20, Z21
	VXORPD Z18, Z21, Z18
	VGF2P8AFFINEQB $0x00, Z17, Z20, Z21
	VXORPD Z19, Z21, Z19

	// Store 2 outputs
	VMOVDQU64 Z18, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z19, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_9x2_64Xor_loop
	VZEROUPPER

mulGFNI_9x2_64Xor_end:
	RET
// mulAvxGFNI_9x2Xor combines 9 inputs into 2 outputs and XORs the results
// into the existing output contents. Processes 32 bytes/loop. Only n>>5 whole
// blocks are handled; the function returns without touching memory when that
// count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i / output j uses matrix[i*2+j].
// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88
	// Loading 12 of 18 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 22 YMM used
	//
	// Register roles:
	//   AX                          remaining 32-byte blocks
	//   CX                          matrix base (kept live: entries 12..17 are
	//                               re-broadcast from memory every iteration)
	//   Y0-Y11                      matrix[0..11], broadcast to every qword lane
	//   BX,SI,DI,R8-R12,DX          read cursors for in[0..8]
	//   R14, R13                    read/write cursors for out[0], out[1]
	//   Y12, Y13 accumulators; Y14 input block; Y15 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R13
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, DX

mulAvxGFNI_9x2Xor_loop:
	// Load 2 outputs (seed the accumulators with the current output bytes)
	VMOVDQU (R14), Y12
	VMOVDQU (R13), Y13

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	// (from here on the tables are not resident and are broadcast into Y15)
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x2Xor_loop
	VZEROUPPER

mulAvxGFNI_9x2Xor_end:
	RET
// mulGFNI_9x3_64 combines 9 inputs into 3 outputs; the outputs are
// overwritten. Processes 64 bytes/loop. Only n>>6 whole blocks are handled;
// the function returns without touching memory when that count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i / output j uses matrix[i*3+j].
// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	//
	// Register roles:
	//   AX                          remaining 64-byte blocks
	//   Z0-Z26                      matrix[0..26], broadcast to every qword lane
	//   DX,BX,SI,DI,R8-R11,CX       read cursors for in[0..8]
	//   R13, R14, R12               write cursors for out[0], out[1], out[2]
	//   Z27-Z29 accumulators; Z30 input block; Z31 scratch
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x06, AX
	JZ mulGFNI_9x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	VBROADCASTF32X2 208(CX), Z26
	// The matrix is fully in registers, so CX is recycled for the inputs.
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), CX
	MOVQ out_base+48(FP), R12
	MOVQ (R12), R13
	MOVQ 24(R12), R14
	MOVQ 48(R12), R12
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R12

	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, CX

mulGFNI_9x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (first input writes the accumulators directly, so no prior clear is needed)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z29

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 3 outputs
	VMOVDQU64 (CX), Z30
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 3 outputs
	VMOVDQU64 Z27, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z28, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z29, (R12)
	ADDQ $0x40, R12

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_9x3_64_loop
	VZEROUPPER

mulGFNI_9x3_64_end:
	RET
// mulAvxGFNI_9x3 combines 9 inputs into 3 outputs; the outputs are
// overwritten. Processes 32 bytes/loop. Only n>>5 whole blocks are handled;
// the function returns without touching memory when that count is zero.
// Each matrix qword is the 8x8 bit-matrix operand of VGF2P8AFFINEQB (constant
// term 0); input i / output j uses matrix[i*3+j].
// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x3(SB), $8-88
	// Loading 11 of 27 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 32 YMM used
	//
	// Register roles:
	//   AX                          remaining 32-byte blocks
	//   CX                          matrix base (kept live: entries 11..26 are
	//                               re-broadcast from memory every iteration)
	//   Y0-Y10                      matrix[0..10], broadcast to every qword lane
	//   BX,SI,DI,R8-R12,DX          read cursors for in[0..8]
	//   R14, R15, R13               write cursors for out[0], out[1], out[2]
	//   BP                          start offset (only used during setup)
	//   Y11-Y13 accumulators; Y14 input block; Y15 scratch
	// NOTE(review): BP is used as a plain scratch register here; the non-zero
	// frame size ($8) presumably exists so the assembler saves/restores BP —
	// confirm against the Go assembler documentation before changing it.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result; no TESTQ needed.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	// Slice headers are 24 bytes apart; offset 0 of each is the data pointer.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, DX

mulAvxGFNI_9x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (first input writes the accumulators directly, so no prior clear is needed)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	// (the resident tables run out mid-input: matrix[11] onward is broadcast
	// from memory into Y15)
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_9x3_loop
	VZEROUPPER

mulAvxGFNI_9x3_end:
	RET
// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating ("Xor") kernel for 9 inputs and 3 outputs, 64 bytes per loop
// iteration. For every 64-byte chunk it loads the current contents of the three
// output buffers, and for each input i and output j XORs in the result of
// VGF2P8AFFINEQB applied to the input chunk with the 8-byte entry
// matrix[i*3+j], then stores the three accumulators back.
//
// Runs n>>6 iterations starting at byte offset start in every buffer; returns
// without touching memory when n>>6 == 0. Any n%64 tail is not processed here.
//
// Frame: $0-88 = no locals, 88 bytes of arguments (3 slice headers + 2 ints).
//
// Register roles inside the loop:
//   AX        remaining iterations
//   DX, BX, SI, DI, R8, R9, R10, R11, CX   read pointers for in[0]..in[8]
//   R13, R14, R12                          write pointers for out[0]..out[2]
//   Z0-Z26    matrix[0]..matrix[26], each broadcast to all 64-bit lanes
//   Z27-Z29   accumulators for out[0]..out[2]
//   Z30       current input chunk, Z31 scratch product
TEXT ·mulGFNI_9x3_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
// AX = n / 64 = number of 64-byte iterations; bail out if there are none.
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x3_64Xor_end
// Broadcast all 27 eight-byte matrix entries into Z0..Z26.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
VBROADCASTF32X2 200(CX), Z25
VBROADCASTF32X2 208(CX), Z26
// The matrix pointer is no longer needed, so CX is reused for the input
// slice-of-slices. Each element of in is a 24-byte slice header whose first
// word is the data pointer; CX itself ends up holding in[8]'s data pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), R11
MOVQ 192(CX), CX
MOVQ out_base+48(FP), R12
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (the file header says this code is generated by gen.go).
MOVQ out_base+48(FP), R12
// Same 24-byte stride for out; R12 is overwritten last with out[2]'s pointer.
MOVQ (R12), R13
MOVQ 24(R12), R14
MOVQ 48(R12), R12
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R13
ADDQ R15, R14
ADDQ R15, R12
// Add start offset to input
ADDQ R15, DX
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, CX
mulGFNI_9x3_64Xor_loop:
// Load 3 outputs
// (existing output bytes seed the accumulators; this is what makes it "Xor")
VMOVDQU64 (R13), Z27
VMOVDQU64 (R14), Z28
VMOVDQU64 (R12), Z29
// Load and process 64 bytes from input 0 to 3 outputs
// Pattern for every input: product = affine(input, matrix entry) with imm8 = 0
// (no constant byte added), then XOR the product into that output's accumulator.
// VXORPD is used purely as a bitwise XOR on the byte data.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 3 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 3 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 3 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 3 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 3 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 3 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 3 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 3 outputs
VMOVDQU64 (CX), Z30
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
VXORPD Z29, Z31, Z29
// Store 3 outputs
VMOVDQU64 Z27, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z28, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z29, (R12)
ADDQ $0x40, R12
// Prepare for next loop
DECQ AX
JNZ mulGFNI_9x3_64Xor_loop
// Only reached after vector registers were used; the early-exit path jumps
// straight to the label below and skips this.
VZEROUPPER
mulGFNI_9x3_64Xor_end:
RET
// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit (YMM) accumulating kernel for 9 inputs and 3 outputs, 32 bytes per
// loop iteration. For every 32-byte chunk it loads the current contents of the
// three output buffers, and for each input i and output j XORs in the result of
// VGF2P8AFFINEQB applied to the input chunk with the 8-byte entry
// matrix[i*3+j], then stores the accumulators back.
//
// Runs n>>5 iterations starting at byte offset start in every buffer; returns
// without touching memory when n>>5 == 0. Any n%32 tail is not processed here.
//
// Only matrix[0]..matrix[10] fit in registers (Y0-Y10); entries 11..26 are
// re-broadcast from memory into Y15 on every iteration, so CX must keep
// pointing at the matrix for the whole loop.
//
// NOTE(review): the 8-byte frame ($8-88) is presumably declared so that the
// assembler preserves BP, which is used as a scratch register below - confirm
// against the Go assembler documentation.
//
// Register roles inside the loop:
//   AX        remaining iterations
//   CX        matrix base
//   BX, SI, DI, R8, R9, R10, R11, R12, DX   read pointers for in[0]..in[8]
//   R14, R15, R13                           write pointers for out[0]..out[2]
//   Y11-Y13   accumulators for out[0]..out[2]
//   Y14       current input chunk
//   Y15       scratch: broadcast matrix entry and/or product
TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88
// Loading 11 of 27 tables to registers
// Destination kept in GP registers
// Full registers estimated 32 YMM used
// AX = n / 32 = number of 32-byte iterations; bail out if there are none.
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_9x3Xor_end
// Broadcast the first 11 eight-byte matrix entries to all 64-bit lanes.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
// Each element of in is a 24-byte slice header whose first word is the data
// pointer; DX is overwritten last with in[8]'s data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (the file header says this code is generated by gen.go).
MOVQ out_base+48(FP), R13
// Same 24-byte stride for out; R13 is overwritten last with out[2]'s pointer.
MOVQ (R13), R14
MOVQ 24(R13), R15
MOVQ 48(R13), R13
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R13
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, DX
mulAvxGFNI_9x3Xor_loop:
// Load 3 outputs
// (existing output bytes seed the accumulators; this is what makes it "Xor")
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (R13), Y13
// Load and process 32 bytes from input 0 to 3 outputs
// Pattern for every input: product = affine(input, matrix entry) with imm8 = 0
// (no constant byte added), then XOR the product into that output's accumulator.
// VXORPD is used purely as a bitwise XOR on the byte data.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 3 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 3 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 3 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
// From matrix[11] onward the entry is not register-resident: broadcast it
// into Y15, which is then overwritten in place by the product.
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 3 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 3 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 3 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 3 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 3 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 3 outputs
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R13)
ADDQ $0x20, R13
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_9x3Xor_loop
// Only reached after vector registers were used; the early-exit path jumps
// straight to the label below and skips this.
VZEROUPPER
mulAvxGFNI_9x3Xor_end:
RET
// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting kernel for 9 inputs and 4 outputs, 64 bytes per loop iteration.
// For every 64-byte chunk and each output j it computes the XOR over all
// inputs i of VGF2P8AFFINEQB(in[i] chunk, matrix[i*4+j]) and stores the result
// to out[j]. Previous output contents are not read: input 0 initialises the
// accumulators directly.
//
// Runs n>>6 iterations starting at byte offset start in every buffer; returns
// without touching memory when n>>6 == 0. Any n%64 tail is not processed here.
//
// matrix[0]..matrix[25] are kept in Z0-Z25; matrix[26]..matrix[35] are used as
// embedded-broadcast memory operands (.BCST), so CX must keep pointing at the
// matrix for the whole loop.
//
// NOTE(review): the 8-byte frame ($8-88) is presumably declared so that the
// assembler preserves BP, which is used as a scratch register below - confirm
// against the Go assembler documentation.
//
// Register roles inside the loop:
//   BP        remaining iterations
//   CX        matrix base
//   DX, BX, SI, DI, R8, R9, R10, R11, AX   read pointers for in[0]..in[8]
//   R13, R14, R15, R12                     write pointers for out[0]..out[3]
//   Z26-Z29   accumulators for out[0]..out[3]
//   Z30       current input chunk, Z31 scratch product
TEXT ·mulGFNI_9x4_64(SB), $8-88
// Loading 26 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX = n / 64 is used here only for the early-exit test; AX is recycled as an
// input pointer below and the count is recomputed into BP before the loop.
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x4_64_end
// Broadcast the first 26 eight-byte matrix entries to all 64-bit lanes.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
VBROADCASTF32X2 200(CX), Z25
// Each element of in is a 24-byte slice header whose first word is the data
// pointer; AX is overwritten last with in[8]'s data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), R11
MOVQ 192(AX), AX
MOVQ out_base+48(FP), R12
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (the file header says this code is generated by gen.go).
MOVQ out_base+48(FP), R12
// Same 24-byte stride for out; R12 is overwritten last with out[3]'s pointer.
MOVQ (R12), R13
MOVQ 24(R12), R14
MOVQ 48(R12), R15
MOVQ 72(R12), R12
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R12
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, AX
// Reload length to save a register
// (BP held the start offset until here; it now becomes the loop counter)
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_9x4_64_loop:
// Load and process 64 bytes from input 0 to 4 outputs
// Input 0 writes its products straight into the accumulators (no XOR), which
// is why the outputs never need to be loaded or zeroed.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
// Load and process 64 bytes from input 1 to 4 outputs
// Pattern for the remaining inputs: product = affine(input, matrix entry) with
// imm8 = 0 (no constant byte added), then XOR into that output's accumulator.
// VXORPD is used purely as a bitwise XOR on the byte data.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 4 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 4 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 4 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 4 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 4 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
VXORPD Z27, Z31, Z27
// From matrix[26] onward the entry is not register-resident; it is read from
// memory with embedded broadcast instead.
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 4 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 4 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 4 outputs
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R12)
ADDQ $0x40, R12
// Prepare for next loop
DECQ BP
JNZ mulGFNI_9x4_64_loop
// Only reached after vector registers were used; the early-exit path jumps
// straight to the label below and skips this.
VZEROUPPER
mulGFNI_9x4_64_end:
RET
// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit (YMM) overwriting kernel for 9 inputs and 4 outputs, 32 bytes per
// loop iteration. For every 32-byte chunk and each output j it computes the
// XOR over all inputs i of VGF2P8AFFINEQB(in[i] chunk, matrix[i*4+j]) and
// stores the result to out[j]. Previous output contents are not read: input 0
// initialises the accumulators directly.
//
// Runs n>>5 iterations starting at byte offset start in every buffer; returns
// without touching memory when n>>5 == 0. Any n%32 tail is not processed here.
//
// Only matrix[0]..matrix[9] fit in registers (Y0-Y9); entries 10..35 are
// re-broadcast from memory into Y15 on every iteration, so CX must keep
// pointing at the matrix for the whole loop.
//
// NOTE(review): the 8-byte frame ($8-88) is presumably declared so that the
// assembler preserves BP, which is used as a scratch register below - confirm
// against the Go assembler documentation.
//
// Register roles inside the loop:
//   BP        remaining iterations
//   CX        matrix base
//   DX, BX, SI, DI, R8, R9, R10, R11, AX   read pointers for in[0]..in[8]
//   R13, R14, R15, R12                     write pointers for out[0]..out[3]
//   Y10-Y13   accumulators for out[0]..out[3]
//   Y14       current input chunk
//   Y15       scratch: broadcast matrix entry and/or product
TEXT ·mulAvxGFNI_9x4(SB), $8-88
// Loading 10 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX = n / 32 is used here only for the early-exit test; AX is recycled as an
// input pointer below and the count is recomputed into BP before the loop.
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_9x4_end
// Broadcast the first 10 eight-byte matrix entries to all 64-bit lanes.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// Each element of in is a 24-byte slice header whose first word is the data
// pointer; AX is overwritten last with in[8]'s data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), R11
MOVQ 192(AX), AX
MOVQ out_base+48(FP), R12
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (the file header says this code is generated by gen.go).
MOVQ out_base+48(FP), R12
// Same 24-byte stride for out; R12 is overwritten last with out[3]'s pointer.
MOVQ (R12), R13
MOVQ 24(R12), R14
MOVQ 48(R12), R15
MOVQ 72(R12), R12
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R12
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, AX
// Reload length to save a register
// (BP held the start offset until here; it now becomes the loop counter)
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_9x4_loop:
// Load and process 32 bytes from input 0 to 4 outputs
// Input 0 writes its products straight into the accumulators (no XOR), which
// is why the outputs never need to be loaded or zeroed.
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
// Load and process 32 bytes from input 1 to 4 outputs
// Pattern for the remaining inputs: product = affine(input, matrix entry) with
// imm8 = 0 (no constant byte added), then XOR into that output's accumulator.
// VXORPD is used purely as a bitwise XOR on the byte data.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 4 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
// From matrix[10] onward the entry is not register-resident: broadcast it
// into Y15, which is then overwritten in place by the product.
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 4 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 4 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 4 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 4 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 4 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 4 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 4 outputs
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R12)
ADDQ $0x20, R12
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_9x4_loop
// Only reached after vector registers were used; the early-exit path jumps
// straight to the label below and skips this.
VZEROUPPER
mulAvxGFNI_9x4_end:
RET
// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating ("Xor") kernel for 9 inputs and 4 outputs, 64 bytes per loop
// iteration. For every 64-byte chunk it loads the current contents of the four
// output buffers, and for each input i and output j XORs in the result of
// VGF2P8AFFINEQB applied to the input chunk with the 8-byte entry
// matrix[i*4+j], then stores the accumulators back.
//
// Runs n>>6 iterations starting at byte offset start in every buffer; returns
// without touching memory when n>>6 == 0. Any n%64 tail is not processed here.
//
// matrix[0]..matrix[25] are kept in Z0-Z25; matrix[26]..matrix[35] are used as
// embedded-broadcast memory operands (.BCST), so CX must keep pointing at the
// matrix for the whole loop.
//
// NOTE(review): the 8-byte frame ($8-88) is presumably declared so that the
// assembler preserves BP, which is used as a scratch register below - confirm
// against the Go assembler documentation.
//
// Register roles inside the loop:
//   BP        remaining iterations
//   CX        matrix base
//   DX, BX, SI, DI, R8, R9, R10, R11, AX   read pointers for in[0]..in[8]
//   R13, R14, R15, R12                     write pointers for out[0]..out[3]
//   Z26-Z29   accumulators for out[0]..out[3]
//   Z30       current input chunk, Z31 scratch product
TEXT ·mulGFNI_9x4_64Xor(SB), $8-88
// Loading 26 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX = n / 64 is used here only for the early-exit test; AX is recycled as an
// input pointer below and the count is recomputed into BP before the loop.
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x4_64Xor_end
// Broadcast the first 26 eight-byte matrix entries to all 64-bit lanes.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
VBROADCASTF32X2 200(CX), Z25
// Each element of in is a 24-byte slice header whose first word is the data
// pointer; AX is overwritten last with in[8]'s data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), R11
MOVQ 192(AX), AX
MOVQ out_base+48(FP), R12
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (the file header says this code is generated by gen.go).
MOVQ out_base+48(FP), R12
// Same 24-byte stride for out; R12 is overwritten last with out[3]'s pointer.
MOVQ (R12), R13
MOVQ 24(R12), R14
MOVQ 48(R12), R15
MOVQ 72(R12), R12
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R12
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, AX
// Reload length to save a register
// (BP held the start offset until here; it now becomes the loop counter)
MOVQ n+80(FP), BP
SHRQ $0x06, BP
mulGFNI_9x4_64Xor_loop:
// Load 4 outputs
// (existing output bytes seed the accumulators; this is what makes it "Xor")
VMOVDQU64 (R13), Z26
VMOVDQU64 (R14), Z27
VMOVDQU64 (R15), Z28
VMOVDQU64 (R12), Z29
// Load and process 64 bytes from input 0 to 4 outputs
// Pattern for every input: product = affine(input, matrix entry) with imm8 = 0
// (no constant byte added), then XOR the product into that output's accumulator.
// VXORPD is used purely as a bitwise XOR on the byte data.
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 4 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 4 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 4 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 4 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 4 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 4 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
VXORPD Z27, Z31, Z27
// From matrix[26] onward the entry is not register-resident; it is read from
// memory with embedded broadcast instead.
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 4 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 4 outputs
VMOVDQU64 (AX), Z30
ADDQ $0x40, AX
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 4 outputs
VMOVDQU64 Z26, (R13)
ADDQ $0x40, R13
VMOVDQU64 Z27, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z28, (R15)
ADDQ $0x40, R15
VMOVDQU64 Z29, (R12)
ADDQ $0x40, R12
// Prepare for next loop
DECQ BP
JNZ mulGFNI_9x4_64Xor_loop
// Only reached after vector registers were used; the early-exit path jumps
// straight to the label below and skips this.
VZEROUPPER
mulGFNI_9x4_64Xor_end:
RET
// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit (YMM) accumulating kernel for 9 inputs and 4 outputs, 32 bytes per
// loop iteration. For every 32-byte chunk it loads the current contents of the
// four output buffers, and for each input i and output j XORs in the result of
// VGF2P8AFFINEQB applied to the input chunk with the 8-byte entry
// matrix[i*4+j], then stores the accumulators back.
//
// Runs n>>5 iterations starting at byte offset start in every buffer; returns
// without touching memory when n>>5 == 0. Any n%32 tail is not processed here.
//
// Only matrix[0]..matrix[9] fit in registers (Y0-Y9); entries 10..35 are
// re-broadcast from memory into Y15 on every iteration, so CX must keep
// pointing at the matrix for the whole loop.
//
// NOTE(review): the 8-byte frame ($8-88) is presumably declared so that the
// assembler preserves BP, which is used as a scratch register below - confirm
// against the Go assembler documentation.
//
// Register roles inside the loop:
//   BP        remaining iterations
//   CX        matrix base
//   DX, BX, SI, DI, R8, R9, R10, R11, AX   read pointers for in[0]..in[8]
//   R13, R14, R15, R12                     write pointers for out[0]..out[3]
//   Y10-Y13   accumulators for out[0]..out[3]
//   Y14       current input chunk
//   Y15       scratch: broadcast matrix entry and/or product
TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88
// Loading 10 of 36 tables to registers
// Destination kept in GP registers
// Full registers estimated 42 YMM used
// AX = n / 32 is used here only for the early-exit test; AX is recycled as an
// input pointer below and the count is recomputed into BP before the loop.
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_9x4Xor_end
// Broadcast the first 10 eight-byte matrix entries to all 64-bit lanes.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// Each element of in is a 24-byte slice header whose first word is the data
// pointer; AX is overwritten last with in[8]'s data pointer.
MOVQ in_base+24(FP), AX
MOVQ (AX), DX
MOVQ 24(AX), BX
MOVQ 48(AX), SI
MOVQ 72(AX), DI
MOVQ 96(AX), R8
MOVQ 120(AX), R9
MOVQ 144(AX), R10
MOVQ 168(AX), R11
MOVQ 192(AX), AX
MOVQ out_base+48(FP), R12
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (the file header says this code is generated by gen.go).
MOVQ out_base+48(FP), R12
// Same 24-byte stride for out; R12 is overwritten last with out[3]'s pointer.
MOVQ (R12), R13
MOVQ 24(R12), R14
MOVQ 48(R12), R15
MOVQ 72(R12), R12
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R13
ADDQ BP, R14
ADDQ BP, R15
ADDQ BP, R12
// Add start offset to input
ADDQ BP, DX
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, AX
// Reload length to save a register
// (BP held the start offset until here; it now becomes the loop counter)
MOVQ n+80(FP), BP
SHRQ $0x05, BP
mulAvxGFNI_9x4Xor_loop:
// Load 4 outputs
// (existing output bytes seed the accumulators; this is what makes it "Xor")
VMOVDQU (R13), Y10
VMOVDQU (R14), Y11
VMOVDQU (R15), Y12
VMOVDQU (R12), Y13
// Load and process 32 bytes from input 0 to 4 outputs
// Pattern for every input: product = affine(input, matrix entry) with imm8 = 0
// (no constant byte added), then XOR the product into that output's accumulator.
// VXORPD is used purely as a bitwise XOR on the byte data.
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 4 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 4 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
// From matrix[10] onward the entry is not register-resident: broadcast it
// into Y15, which is then overwritten in place by the product.
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 4 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 4 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 4 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 4 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 4 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 4 outputs
VMOVDQU (AX), Y14
ADDQ $0x20, AX
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 4 outputs
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
VMOVDQU Y11, (R14)
ADDQ $0x20, R14
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R12)
ADDQ $0x20, R12
// Prepare for next loop
DECQ BP
JNZ mulAvxGFNI_9x4Xor_loop
// Only reached after vector registers were used; the early-exit path jumps
// straight to the label below and skips this.
VZEROUPPER
mulAvxGFNI_9x4Xor_end:
RET
// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x5_64(SB), $0-88
// Loading 25 of 45 tables to registers
// Destination kept on stack
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x5_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x5_64_loop:
// Load and process 64 bytes from input 0 to 5 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 5 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 5 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 5 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
MOVQ (R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x5_64_loop
VZEROUPPER
mulGFNI_9x5_64_end:
RET
// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 9 input slices into 5 output slices, 32 bytes per iteration,
// overwriting the outputs. For every 32-byte block and every output j:
//   out[j] = XOR over i=0..8 of GF2P8AFFINEQB(in[i], matrix[i*5+j])
// The loop runs n>>5 times; if that count is zero the function returns
// without touching memory. Any remainder of n below 32 bytes is not handled
// here.
//
// Register roles:
//   AX      remaining 32-byte iterations
//   CX      base of matrix (one 64-bit entry per input/output pair)
//   BX, SI, DI, R8-R12, DX   read cursors for in[0]..in[8] (pre-offset by start)
//   R13     base of the out slice-header array (24 bytes per header)
//   R14     byte offset applied to every output pointer (begins at start)
//   R15     scratch: data pointer of the output currently being stored
//   Y0-Y8   matrix[0..8], each broadcast to all four 64-bit lanes
//   Y9-Y13  accumulators for out[0]..out[4]
//   Y14     current 32-byte input block
//   Y15     scratch: broadcast matrix entry, then the affine result
TEXT ·mulAvxGFNI_9x5(SB), $0-88
// Loading 9 of 45 tables to registers
// Destination kept on stack
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 32-byte blocks; bail out if none.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_9x5_end
// Preload the first 9 matrix entries (all of input 0, first four of input 1).
// The remaining 36 entries are re-broadcast from memory inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Fetch the data pointer of each of the 9 input slices (headers are 24 bytes
// apart). DX is loaded last because it also holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
// NOTE(review): out_base is loaded into R13 twice in a row; the second load
// is redundant. This file is generated, so a cleanup belongs in the generator.
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x5_loop:
// Load and process 32 bytes from input 0 to 5 outputs
// Input 0 writes the accumulators directly, so previous output contents are
// never read by this (non-Xor) variant.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
// Load and process 32 bytes from input 1 to 5 outputs
// From here on each product goes through Y15 and is XORed into its accumulator.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// Matrix entries from index 9 upward are not register-resident: broadcast
// each one from memory into Y15 right before it is used.
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 5 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 5 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
// Each output's data pointer is re-read from its slice header on every
// iteration (R15 is the only free scratch register); R14 supplies the offset.
MOVQ (R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x5_loop
// Clear the upper YMM halves before returning to non-AVX code.
VZEROUPPER
mulAvxGFNI_9x5_end:
RET
// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 9 input slices into 5 output slices, 64 bytes per iteration, and
// XORs the result into the existing output contents. For every 64-byte block
// and every output j:
//   out[j] ^= XOR over i=0..8 of GF2P8AFFINEQB(in[i], matrix[i*5+j])
// The loop runs n>>6 times; if that count is zero the function returns
// without touching memory. Any remainder of n below 64 bytes is not handled
// here.
//
// Register roles:
//   AX       remaining 64-byte iterations
//   CX       base of matrix (one 64-bit entry per input/output pair)
//   BX, SI, DI, R8-R12, DX   read cursors for in[0]..in[8] (pre-offset by start)
//   R13      base of the out slice-header array (24 bytes per header)
//   R14      byte offset applied to every output pointer (begins at start)
//   R15      scratch: data pointer of the output being loaded or stored
//   Z0-Z24   matrix[0..24] (inputs 0-4), each broadcast across the register
//   Z25-Z29  accumulators for out[0]..out[4]
//   Z30      current 64-byte input block
//   Z31      scratch: affine result awaiting XOR into an accumulator
TEXT ·mulGFNI_9x5_64Xor(SB), $0-88
// Loading 25 of 45 tables to registers
// Destination kept on stack
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 64-byte blocks; bail out if none.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x5_64Xor_end
// Preload the matrix entries for inputs 0-4. Entries for inputs 5-8 are
// consumed straight from memory via the .BCST form inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// Fetch the data pointer of each of the 9 input slices (headers are 24 bytes
// apart). DX is loaded last because it also holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
// NOTE(review): out_base is loaded into R13 twice in a row; the second load
// is redundant. This file is generated, so a cleanup belongs in the generator.
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x5_64Xor_loop:
// Load 5 outputs
// Seed the accumulators with the current output bytes so that every product
// below is XORed on top of them.
MOVQ (R13), R15
VMOVDQU64 (R15)(R14*1), Z25
MOVQ 24(R13), R15
VMOVDQU64 (R15)(R14*1), Z26
MOVQ 48(R13), R15
VMOVDQU64 (R15)(R14*1), Z27
MOVQ 72(R13), R15
VMOVDQU64 (R15)(R14*1), Z28
MOVQ 96(R13), R15
VMOVDQU64 (R15)(R14*1), Z29
// Load and process 64 bytes from input 0 to 5 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// Matrix entries from index 25 upward are not register-resident; the .BCST
// memory operand broadcasts the 64-bit entry directly from the matrix.
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 5 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 5 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 5 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
// Output data pointers are re-read from the slice headers because R15 is the
// only free scratch register; R14 supplies the offset.
MOVQ (R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x5_64Xor_loop
// Clear the upper vector state before returning to non-AVX code.
VZEROUPPER
mulGFNI_9x5_64Xor_end:
RET
// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 9 input slices into 5 output slices, 32 bytes per iteration, and
// XORs the result into the existing output contents. For every 32-byte block
// and every output j:
//   out[j] ^= XOR over i=0..8 of GF2P8AFFINEQB(in[i], matrix[i*5+j])
// The loop runs n>>5 times; if that count is zero the function returns
// without touching memory. Any remainder of n below 32 bytes is not handled
// here.
//
// Register roles:
//   AX      remaining 32-byte iterations
//   CX      base of matrix (one 64-bit entry per input/output pair)
//   BX, SI, DI, R8-R12, DX   read cursors for in[0]..in[8] (pre-offset by start)
//   R13     base of the out slice-header array (24 bytes per header)
//   R14     byte offset applied to every output pointer (begins at start)
//   R15     scratch: data pointer of the output being loaded or stored
//   Y0-Y8   matrix[0..8], each broadcast to all four 64-bit lanes
//   Y9-Y13  accumulators for out[0]..out[4]
//   Y14     current 32-byte input block
//   Y15     scratch: broadcast matrix entry, then the affine result
TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88
// Loading 9 of 45 tables to registers
// Destination kept on stack
// Full registers estimated 52 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 32-byte blocks; bail out if none.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_9x5Xor_end
// Preload the first 9 matrix entries (all of input 0, first four of input 1).
// The remaining 36 entries are re-broadcast from memory inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Fetch the data pointer of each of the 9 input slices (headers are 24 bytes
// apart). DX is loaded last because it also holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
// NOTE(review): out_base is loaded into R13 twice in a row; the second load
// is redundant. This file is generated, so a cleanup belongs in the generator.
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x5Xor_loop:
// Load 5 outputs
// Seed the accumulators with the current output bytes so that every product
// below is XORed on top of them.
MOVQ (R13), R15
VMOVDQU (R15)(R14*1), Y9
MOVQ 24(R13), R15
VMOVDQU (R15)(R14*1), Y10
MOVQ 48(R13), R15
VMOVDQU (R15)(R14*1), Y11
MOVQ 72(R13), R15
VMOVDQU (R15)(R14*1), Y12
MOVQ 96(R13), R15
VMOVDQU (R15)(R14*1), Y13
// Load and process 32 bytes from input 0 to 5 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// Matrix entries from index 9 upward are not register-resident: broadcast
// each one from memory into Y15 right before it is used.
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 5 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 5 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
// Output data pointers are re-read from the slice headers because R15 is the
// only free scratch register; R14 supplies the offset.
MOVQ (R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x5Xor_loop
// Clear the upper YMM halves before returning to non-AVX code.
VZEROUPPER
mulAvxGFNI_9x5Xor_end:
RET
// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 9 input slices into 6 output slices, 64 bytes per iteration,
// overwriting the outputs. For every 64-byte block and every output j:
//   out[j] = XOR over i=0..8 of GF2P8AFFINEQB(in[i], matrix[i*6+j])
// The loop runs n>>6 times; if that count is zero the function returns
// without touching memory. Any remainder of n below 64 bytes is not handled
// here.
//
// Register roles:
//   AX       remaining 64-byte iterations
//   CX       base of matrix (one 64-bit entry per input/output pair)
//   BX, SI, DI, R8-R12, DX   read cursors for in[0]..in[8] (pre-offset by start)
//   R13      base of the out slice-header array (24 bytes per header)
//   R14      byte offset applied to every output pointer (begins at start)
//   R15      scratch: data pointer of the output currently being stored
//   Z0-Z23   matrix[0..23] (inputs 0-3), each broadcast across the register
//   Z24-Z29  accumulators for out[0]..out[5]
//   Z30      current 64-byte input block
//   Z31      scratch: affine result awaiting XOR into an accumulator
TEXT ·mulGFNI_9x6_64(SB), $0-88
// Loading 24 of 54 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 64-byte blocks; bail out if none.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x6_64_end
// Preload the matrix entries for inputs 0-3. Entries for inputs 4-8 are
// consumed straight from memory via the .BCST form inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Fetch the data pointer of each of the 9 input slices (headers are 24 bytes
// apart). DX is loaded last because it also holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
// NOTE(review): out_base is loaded into R13 twice in a row; the second load
// is redundant. This file is generated, so a cleanup belongs in the generator.
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x6_64_loop:
// Load and process 64 bytes from input 0 to 6 outputs
// Input 0 writes the accumulators directly, so previous output contents are
// never read by this (non-Xor) variant.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
// Load and process 64 bytes from input 1 to 6 outputs
// From here on each product goes through Z31 and is XORed into its accumulator.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// Matrix entries from index 24 upward are not register-resident; the .BCST
// memory operand broadcasts the 64-bit entry directly from the matrix.
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 6 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 6 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
// Each output's data pointer is re-read from its slice header on every
// iteration (R15 is the only free scratch register); R14 supplies the offset.
MOVQ (R13), R15
VMOVDQU64 Z24, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x6_64_loop
// Clear the upper vector state before returning to non-AVX code.
VZEROUPPER
mulGFNI_9x6_64_end:
RET
// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 9 input slices into 6 output slices, 32 bytes per iteration,
// overwriting the outputs. For every 32-byte block and every output j:
//   out[j] = XOR over i=0..8 of GF2P8AFFINEQB(in[i], matrix[i*6+j])
// The loop runs n>>5 times; if that count is zero the function returns
// without touching memory. Any remainder of n below 32 bytes is not handled
// here.
//
// Register roles:
//   AX      remaining 32-byte iterations
//   CX      base of matrix (one 64-bit entry per input/output pair)
//   BX, SI, DI, R8-R12, DX   read cursors for in[0]..in[8] (pre-offset by start)
//   R13     base of the out slice-header array (24 bytes per header)
//   R14     byte offset applied to every output pointer (begins at start)
//   R15     scratch: data pointer of the output currently being stored
//   Y0-Y7   matrix[0..7], each broadcast to all four 64-bit lanes
//   Y8-Y13  accumulators for out[0]..out[5]
//   Y14     current 32-byte input block
//   Y15     scratch: broadcast matrix entry, then the affine result
TEXT ·mulAvxGFNI_9x6(SB), $0-88
// Loading 8 of 54 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// Convert the byte count into a count of 32-byte blocks; bail out if none.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_9x6_end
// Preload the first 8 matrix entries (all of input 0, first two of input 1).
// The remaining 46 entries are re-broadcast from memory inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Fetch the data pointer of each of the 9 input slices (headers are 24 bytes
// apart). DX is loaded last because it also holds the header array base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
// NOTE(review): out_base is loaded into R13 twice in a row; the second load
// is redundant. This file is generated, so a cleanup belongs in the generator.
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x6_loop:
// Load and process 32 bytes from input 0 to 6 outputs
// Input 0 writes the accumulators directly, so previous output contents are
// never read by this (non-Xor) variant.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
// Load and process 32 bytes from input 1 to 6 outputs
// From here on each product goes through Y15 and is XORed into its accumulator.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// Matrix entries from index 8 upward are not register-resident: broadcast
// each one from memory into Y15 right before it is used.
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 6 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 6 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
// Each output's data pointer is re-read from its slice header on every
// iteration (R15 is the only free scratch register); R14 supplies the offset.
MOVQ (R13), R15
VMOVDQU Y8, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the block down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x6_loop
// Clear the upper YMM halves before returning to non-AVX code.
VZEROUPPER
mulAvxGFNI_9x6_end:
RET
// mulGFNI_9x6_64Xor applies a 9x6 matrix of 8-byte tables to 9 input slices
// with VGF2P8AFFINEQB and XORs the products into the 6 output slices in place.
// Processes 64 bytes/loop. n is shifted right by 6 to get the loop count, so a
// trailing n%64 bytes are not handled by this routine.
// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x6_64Xor(SB), $0-88
// Loading 24 of 54 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
// Register roles:
//   AX       = remaining 64-byte iterations
//   CX       = matrix base; table for (input i, output j) is at 8*(i*6+j)(CX)
//   Z0-Z23   = tables 0..23 (all tables for inputs 0-3), broadcast once
//   Z24-Z29  = accumulators for outputs 0-5
//   Z30      = current 64-byte input block, Z31 = scratch product
//   BX, SI, DI, R8-R12, DX = running read pointers for inputs 0-8
//   R13      = base of the out slice-header array, R14 = byte offset into each output
//   R15      = scratch for the current output data pointer
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
// NOTE(review): SHRQ with a non-zero immediate already sets ZF from its
// result, so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Nothing to do for n < 64; this exit is taken before any vector register is written.
JZ mulGFNI_9x6_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Fetch the data pointer of each of the 9 input slices; slice headers are
// read at a 24-byte stride. DX is overwritten last because it holds the
// header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next load repeats the previous instruction verbatim; it is
// redundant but harmless. Presumably a generator artifact, so any fix belongs
// in gen.go rather than in this generated file.
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x6_64Xor_loop:
// Load 6 outputs
// Output data pointers are re-read from the out slice headers (via R13) on
// every iteration and indexed by R14; the existing output bytes seed the
// accumulators, which is what makes this the Xor variant.
MOVQ (R13), R15
VMOVDQU64 (R15)(R14*1), Z24
MOVQ 24(R13), R15
VMOVDQU64 (R15)(R14*1), Z25
MOVQ 48(R13), R15
VMOVDQU64 (R15)(R14*1), Z26
MOVQ 72(R13), R15
VMOVDQU64 (R15)(R14*1), Z27
MOVQ 96(R13), R15
VMOVDQU64 (R15)(R14*1), Z28
MOVQ 120(R13), R15
VMOVDQU64 (R15)(R14*1), Z29
// Load and process 64 bytes from input 0 to 6 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// From here on the tables are not held in registers: each one is read from
// the matrix with an embedded broadcast (.BCST) memory operand.
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 6 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 6 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
MOVQ (R13), R15
VMOVDQU64 Z24, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 64-byte block and count down.
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x6_64Xor_loop
// Only reached after at least one iteration; the early exit jumps past this.
VZEROUPPER
mulGFNI_9x6_64Xor_end:
RET
// mulAvxGFNI_9x6Xor applies a 9x6 matrix of 8-byte tables to 9 input slices
// with VGF2P8AFFINEQB and XORs the products into the 6 output slices in place.
// Processes 32 bytes/loop. n is shifted right by 5 to get the loop count, so a
// trailing n%32 bytes are not handled by this routine.
// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88
// Loading 8 of 54 tables to registers
// Destination kept on stack
// Full registers estimated 62 YMM used
// Register roles:
//   AX       = remaining 32-byte iterations
//   CX       = matrix base; table for (input i, output j) is at 8*(i*6+j)(CX)
//   Y0-Y7    = tables 0..7, broadcast once
//   Y8-Y13   = accumulators for outputs 0-5
//   Y14      = current 32-byte input block
//   Y15      = scratch: holds a freshly broadcast table, then the product
//   BX, SI, DI, R8-R12, DX = running read pointers for inputs 0-8
//   R13      = base of the out slice-header array, R14 = byte offset into each output
//   R15      = scratch for the current output data pointer
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
// NOTE(review): SHRQ with a non-zero immediate already sets ZF from its
// result, so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Nothing to do for n < 32; this exit is taken before any vector register is written.
JZ mulAvxGFNI_9x6Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Fetch the data pointer of each of the 9 input slices; slice headers are
// read at a 24-byte stride. DX is overwritten last because it holds the
// header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next load repeats the previous instruction verbatim; it is
// redundant but harmless. Presumably a generator artifact, so any fix belongs
// in gen.go rather than in this generated file.
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x6Xor_loop:
// Load 6 outputs
// Output data pointers are re-read from the out slice headers (via R13) on
// every iteration and indexed by R14; the existing output bytes seed the
// accumulators, which is what makes this the Xor variant.
MOVQ (R13), R15
VMOVDQU (R15)(R14*1), Y8
MOVQ 24(R13), R15
VMOVDQU (R15)(R14*1), Y9
MOVQ 48(R13), R15
VMOVDQU (R15)(R14*1), Y10
MOVQ 72(R13), R15
VMOVDQU (R15)(R14*1), Y11
MOVQ 96(R13), R15
VMOVDQU (R15)(R14*1), Y12
MOVQ 120(R13), R15
VMOVDQU (R15)(R14*1), Y13
// Load and process 32 bytes from input 0 to 6 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
// Only the first two tables of this input are preloaded (Y6, Y7); every
// later table is broadcast into Y15 right before use and then replaced by
// the product in the same register.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 6 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 6 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
MOVQ (R13), R15
VMOVDQU Y8, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 32-byte block and count down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x6Xor_loop
// Only reached after at least one iteration; the early exit jumps past this.
VZEROUPPER
mulAvxGFNI_9x6Xor_end:
RET
// mulGFNI_9x7_64 applies a 9x7 matrix of 8-byte tables to 9 input slices with
// VGF2P8AFFINEQB and writes the XOR-combined products to the 7 output slices,
// overwriting their previous contents.
// Processes 64 bytes/loop. n is shifted right by 6 to get the loop count, so a
// trailing n%64 bytes are not handled by this routine.
// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x7_64(SB), $0-88
// Loading 23 of 63 tables to registers
// Destination kept on stack
// Full registers estimated 72 YMM used
// Register roles:
//   AX       = remaining 64-byte iterations
//   CX       = matrix base; table for (input i, output j) is at 8*(i*7+j)(CX)
//   Z0-Z22   = tables 0..22, broadcast once
//   Z23-Z29  = accumulators for outputs 0-6
//   Z30      = current 64-byte input block, Z31 = scratch product
//   BX, SI, DI, R8-R12, DX = running read pointers for inputs 0-8
//   R13      = base of the out slice-header array, R14 = byte offset into each output
//   R15      = scratch for the current output data pointer
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
// NOTE(review): SHRQ with a non-zero immediate already sets ZF from its
// result, so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Nothing to do for n < 64; this exit is taken before any vector register is written.
JZ mulGFNI_9x7_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Fetch the data pointer of each of the 9 input slices; slice headers are
// read at a 24-byte stride. DX is overwritten last because it holds the
// header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next load repeats the previous instruction verbatim; it is
// redundant but harmless. Presumably a generator artifact, so any fix belongs
// in gen.go rather than in this generated file.
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// Input 0 writes its products straight into the accumulators, so the
// previous output contents are never read in this (non-Xor) variant.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
// Only the first two tables of this input are preloaded (Z21, Z22); from
// here on each table is read from the matrix with an embedded broadcast
// (.BCST) memory operand.
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 7 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 7 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
// Output data pointers are read from the out slice headers (via R13) on
// every iteration and indexed by R14.
MOVQ (R13), R15
VMOVDQU64 Z23, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z24, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 64-byte block and count down.
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x7_64_loop
// Only reached after at least one iteration; the early exit jumps past this.
VZEROUPPER
mulGFNI_9x7_64_end:
RET
// mulAvxGFNI_9x7 applies a 9x7 matrix of 8-byte tables to 9 input slices with
// VGF2P8AFFINEQB and writes the XOR-combined products to the 7 output slices,
// overwriting their previous contents.
// Processes 32 bytes/loop. n is shifted right by 5 to get the loop count, so a
// trailing n%32 bytes are not handled by this routine.
// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x7(SB), $0-88
// Loading 7 of 63 tables to registers
// Destination kept on stack
// Full registers estimated 72 YMM used
// Register roles:
//   AX       = remaining 32-byte iterations
//   CX       = matrix base; table for (input i, output j) is at 8*(i*7+j)(CX)
//   Y0-Y6    = tables 0..6 (all tables for input 0), broadcast once
//   Y7-Y13   = accumulators for outputs 0-6
//   Y14      = current 32-byte input block
//   Y15      = scratch: holds a freshly broadcast table, then the product
//   BX, SI, DI, R8-R12, DX = running read pointers for inputs 0-8
//   R13      = base of the out slice-header array, R14 = byte offset into each output
//   R15      = scratch for the current output data pointer
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
// NOTE(review): SHRQ with a non-zero immediate already sets ZF from its
// result, so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Nothing to do for n < 32; this exit is taken before any vector register is written.
JZ mulAvxGFNI_9x7_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Fetch the data pointer of each of the 9 input slices; slice headers are
// read at a 24-byte stride. DX is overwritten last because it holds the
// header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next load repeats the previous instruction verbatim; it is
// redundant but harmless. Presumably a generator artifact, so any fix belongs
// in gen.go rather than in this generated file.
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x7_loop:
// Load and process 32 bytes from input 0 to 7 outputs
// Input 0 writes its products straight into the accumulators, so the
// previous output contents are never read in this (non-Xor) variant.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// No tables remain preloaded from here on: each one is broadcast into Y15
// right before use and then replaced by the product in the same register.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 7 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 7 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
// Output data pointers are read from the out slice headers (via R13) on
// every iteration and indexed by R14.
MOVQ (R13), R15
VMOVDQU Y7, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y8, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 32-byte block and count down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x7_loop
// Only reached after at least one iteration; the early exit jumps past this.
VZEROUPPER
mulAvxGFNI_9x7_end:
RET
// mulGFNI_9x7_64Xor applies a 9x7 matrix of 8-byte tables to 9 input slices
// with VGF2P8AFFINEQB and XORs the products into the 7 output slices in place.
// Processes 64 bytes/loop. n is shifted right by 6 to get the loop count, so a
// trailing n%64 bytes are not handled by this routine.
// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x7_64Xor(SB), $0-88
// Loading 23 of 63 tables to registers
// Destination kept on stack
// Full registers estimated 72 YMM used
// Register roles:
//   AX       = remaining 64-byte iterations
//   CX       = matrix base; table for (input i, output j) is at 8*(i*7+j)(CX)
//   Z0-Z22   = tables 0..22, broadcast once
//   Z23-Z29  = accumulators for outputs 0-6
//   Z30      = current 64-byte input block, Z31 = scratch product
//   BX, SI, DI, R8-R12, DX = running read pointers for inputs 0-8
//   R13      = base of the out slice-header array, R14 = byte offset into each output
//   R15      = scratch for the current output data pointer
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
// NOTE(review): SHRQ with a non-zero immediate already sets ZF from its
// result, so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Nothing to do for n < 64; this exit is taken before any vector register is written.
JZ mulGFNI_9x7_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Fetch the data pointer of each of the 9 input slices; slice headers are
// read at a 24-byte stride. DX is overwritten last because it holds the
// header array base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next load repeats the previous instruction verbatim; it is
// redundant but harmless. Presumably a generator artifact, so any fix belongs
// in gen.go rather than in this generated file.
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x7_64Xor_loop:
// Load 7 outputs
// Output data pointers are re-read from the out slice headers (via R13) on
// every iteration and indexed by R14; the existing output bytes seed the
// accumulators, which is what makes this the Xor variant.
MOVQ (R13), R15
VMOVDQU64 (R15)(R14*1), Z23
MOVQ 24(R13), R15
VMOVDQU64 (R15)(R14*1), Z24
MOVQ 48(R13), R15
VMOVDQU64 (R15)(R14*1), Z25
MOVQ 72(R13), R15
VMOVDQU64 (R15)(R14*1), Z26
MOVQ 96(R13), R15
VMOVDQU64 (R15)(R14*1), Z27
MOVQ 120(R13), R15
VMOVDQU64 (R15)(R14*1), Z28
MOVQ 144(R13), R15
VMOVDQU64 (R15)(R14*1), Z29
// Load and process 64 bytes from input 0 to 7 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
// Only the first two tables of this input are preloaded (Z21, Z22); from
// here on each table is read from the matrix with an embedded broadcast
// (.BCST) memory operand.
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 7 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 7 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
MOVQ (R13), R15
VMOVDQU64 Z23, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z24, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 64-byte block and count down.
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x7_64Xor_loop
// Only reached after at least one iteration; the early exit jumps past this.
VZEROUPPER
mulGFNI_9x7_64Xor_end:
RET
// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// XOR-accumulating 9-input / 7-output GFNI kernel, 32 bytes per iteration.
// Every iteration loads the current 32 bytes of the 7 outputs, then for each
// input i (0..8) and output j (0..6) XORs in the result of
// VGF2P8AFFINEQB $0x00 applied to 32 bytes of in[i] with the 64-bit matrix
// word matrix[i*7+j], and finally writes the 7 outputs back.
// Only n>>5 whole 32-byte blocks are handled; a remainder of n is not touched.
//
// Register use:
//   AX      remaining 32-byte blocks
//   CX      matrix base
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//           read cursors for in[0]..in[8] (already offset by start)
//   R13     base of the out slice-header array (24 bytes per header)
//   R14     byte offset into every output, begins at start
//   R15     scratch: data pointer of the output being loaded/stored
//   Y0-Y6   matrix[0..6] broadcast (the words used for input 0)
//   Y7-Y13  accumulators for outputs 0..6
//   Y14     current input block
//   Y15     scratch (broadcast matrix word, then affine result)
//
// NOTE: the file header marks this file as generated by gen.go; the removal
// of the duplicated out_base load and of the TESTQ after SHRQ should be
// mirrored in the generator to survive regeneration.
TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88
	// Loading 7 of 63 tables to registers
	// Destination pointers are re-read from the out slice headers every iteration
	// Full registers estimated 72 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ with a non-zero count sets ZF from its result,
	// so the zero-block early exit needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x7Xor_end

	// Keep the 7 matrix words of input 0 resident in Y0-Y6.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6

	// Fetch the data pointer of each of the 9 input slices.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x7Xor_loop:
	// Load 7 outputs
	MOVQ (R13), R15
	VMOVDQU (R15)(R14*1), Y7
	MOVQ 24(R13), R15
	VMOVDQU (R15)(R14*1), Y8
	MOVQ 48(R13), R15
	VMOVDQU (R15)(R14*1), Y9
	MOVQ 72(R13), R15
	VMOVDQU (R15)(R14*1), Y10
	MOVQ 96(R13), R15
	VMOVDQU (R15)(R14*1), Y11
	MOVQ 120(R13), R15
	VMOVDQU (R15)(R14*1), Y12
	MOVQ 144(R13), R15
	VMOVDQU (R15)(R14*1), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 7 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 7 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 7 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	MOVQ (R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ mulAvxGFNI_9x7Xor_loop
	VZEROUPPER

mulAvxGFNI_9x7Xor_end:
	RET
// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Overwriting 9-input / 8-output GFNI kernel, 64 bytes per iteration.
// For every output j (0..7) the stored block is the XOR over the inputs
// i (0..8) of VGF2P8AFFINEQB $0x00 applied to 64 bytes of in[i] with the
// 64-bit matrix word matrix[i*8+j]. Input 0 initialises the accumulators, so
// the previous contents of out are not read.
// Only n>>6 whole 64-byte blocks are handled; a remainder of n is not touched.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//            read cursors for in[0]..in[8] (already offset by start)
//   R13      base of the out slice-header array (24 bytes per header)
//   R14      byte offset into every output, begins at start
//   R15      scratch: data pointer of the output being stored
//   Z0-Z21   matrix[0..21] broadcast; the remaining words are read from
//            memory with an embedded broadcast (.BCST)
//   Z22-Z29  accumulators for outputs 0..7
//   Z30      current input block
//   Z31      scratch affine result
//
// NOTE: the file header marks this file as generated by gen.go; the removal
// of the duplicated out_base load and of the TESTQ after SHRQ should be
// mirrored in the generator to survive regeneration.
TEXT ·mulGFNI_9x8_64(SB), $0-88
	// Loading 22 of 72 tables to registers
	// Destination pointers are re-read from the out slice headers every iteration
	// Full registers estimated 82 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64. SHRQ with a non-zero count sets ZF from its result,
	// so the zero-block early exit needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_9x8_64_end

	// Keep the first 22 matrix words resident in Z0-Z21.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each of the 9 input slices.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	// (results are written straight into the accumulators, no XOR)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x8_64_loop
	VZEROUPPER

mulGFNI_9x8_64_end:
	RET
// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Overwriting 9-input / 8-output GFNI kernel, 32 bytes per iteration.
// For every output j (0..7) the stored block is the XOR over the inputs
// i (0..8) of VGF2P8AFFINEQB $0x00 applied to 32 bytes of in[i] with the
// 64-bit matrix word matrix[i*8+j]. Input 0 initialises the accumulators, so
// the previous contents of out are not read.
// Only n>>5 whole 32-byte blocks are handled; a remainder of n is not touched.
//
// Register use:
//   AX      remaining 32-byte blocks
//   CX      matrix base
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//           read cursors for in[0]..in[8] (already offset by start)
//   R13     base of the out slice-header array (24 bytes per header)
//   R14     byte offset into every output, begins at start
//   R15     scratch: data pointer of the output being stored
//   Y0-Y5   matrix[0..5] broadcast; all other words are re-broadcast from
//           memory inside the loop
//   Y6-Y13  accumulators for outputs 0..7
//   Y14     current input block
//   Y15     scratch (broadcast matrix word, then affine result)
//
// NOTE: the file header marks this file as generated by gen.go; the removal
// of the duplicated out_base load and of the TESTQ after SHRQ should be
// mirrored in the generator to survive regeneration.
TEXT ·mulAvxGFNI_9x8(SB), $0-88
	// Loading 6 of 72 tables to registers
	// Destination pointers are re-read from the out slice headers every iteration
	// Full registers estimated 82 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 32. SHRQ with a non-zero count sets ZF from its result,
	// so the zero-block early exit needs no separate TESTQ.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x8_end

	// Keep the first 6 matrix words resident in Y0-Y5.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each of the 9 input slices.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	// (results are written straight into the accumulators, no XOR)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	// Outputs 6 and 7 have no resident matrix word: broadcast into the
	// accumulator itself and transform in place.
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 8 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	MOVQ (R13), R15
	VMOVDQU Y6, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ mulAvxGFNI_9x8_loop
	VZEROUPPER

mulAvxGFNI_9x8_end:
	RET
// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// XOR-accumulating 9-input / 8-output GFNI kernel, 64 bytes per iteration.
// Every iteration loads the current 64 bytes of the 8 outputs, then for each
// input i (0..8) and output j (0..7) XORs in the result of
// VGF2P8AFFINEQB $0x00 applied to 64 bytes of in[i] with the 64-bit matrix
// word matrix[i*8+j], and finally writes the 8 outputs back.
// Only n>>6 whole 64-byte blocks are handled; a remainder of n is not touched.
//
// Register use:
//   AX       remaining 64-byte blocks
//   CX       matrix base
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//            read cursors for in[0]..in[8] (already offset by start)
//   R13      base of the out slice-header array (24 bytes per header)
//   R14      byte offset into every output, begins at start
//   R15      scratch: data pointer of the output being loaded/stored
//   Z0-Z21   matrix[0..21] broadcast; the remaining words are read from
//            memory with an embedded broadcast (.BCST)
//   Z22-Z29  accumulators for outputs 0..7
//   Z30      current input block
//   Z31      scratch affine result
//
// NOTE: the file header marks this file as generated by gen.go; the removal
// of the duplicated out_base load and of the TESTQ after SHRQ should be
// mirrored in the generator to survive regeneration.
TEXT ·mulGFNI_9x8_64Xor(SB), $0-88
	// Loading 22 of 72 tables to registers
	// Destination pointers are re-read from the out slice headers every iteration
	// Full registers estimated 82 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX

	// AX = n / 64. SHRQ with a non-zero count sets ZF from its result,
	// so the zero-block early exit needs no separate TESTQ.
	SHRQ $0x06, AX
	JZ mulGFNI_9x8_64Xor_end

	// Keep the first 22 matrix words resident in Z0-Z21.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21

	// Fetch the data pointer of each of the 9 input slices.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x8_64Xor_loop:
	// Load 8 outputs
	MOVQ (R13), R15
	VMOVDQU64 (R15)(R14*1), Z22
	MOVQ 24(R13), R15
	VMOVDQU64 (R15)(R14*1), Z23
	MOVQ 48(R13), R15
	VMOVDQU64 (R15)(R14*1), Z24
	MOVQ 72(R13), R15
	VMOVDQU64 (R15)(R14*1), Z25
	MOVQ 96(R13), R15
	VMOVDQU64 (R15)(R14*1), Z26
	MOVQ 120(R13), R15
	VMOVDQU64 (R15)(R14*1), Z27
	MOVQ 144(R13), R15
	VMOVDQU64 (R15)(R14*1), Z28
	MOVQ 168(R13), R15
	VMOVDQU64 (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 8 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 8 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 8 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x8_64Xor_loop
	VZEROUPPER

mulGFNI_9x8_64Xor_end:
	RET
// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start. For each
// block, every output o (0..7) is read back from memory and updated as
//   out[o] ^= XOR over i in 0..8 of VGF2P8AFFINEQB(in[i], matrix[i*8+o])
// and then written back. Bytes past the last whole 32-byte block are left
// untouched. No slice length is read here, so the caller must guarantee that
// 9 inputs, 8 outputs and 72 matrix entries are addressable.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R12, DX  running data pointers of in[0..8]
//   R13                     base of the out slice headers (24 bytes each)
//   R14                     byte offset applied to every output, +32 per block
//   R15                     scratch: data pointer of the current output
//   Y0-Y5                   matrix[0..5], broadcast once before the loop
//   Y6-Y13                  accumulators for outputs 0..7
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88
	// AX = number of whole 32-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so the branch needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x8Xor_end

	// Preload 6 of the 72 matrix entries; the rest are broadcast on demand.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5

	// Fetch the data pointer of each input from its 24-byte slice header.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Output data pointers are not kept in registers; they are re-read
	// through R13 every block and indexed by R14.
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x8Xor_loop:
	// Load 8 outputs
	MOVQ (R13), R15
	VMOVDQU (R15)(R14*1), Y6
	MOVQ 24(R13), R15
	VMOVDQU (R15)(R14*1), Y7
	MOVQ 48(R13), R15
	VMOVDQU (R15)(R14*1), Y8
	MOVQ 72(R13), R15
	VMOVDQU (R15)(R14*1), Y9
	MOVQ 96(R13), R15
	VMOVDQU (R15)(R14*1), Y10
	MOVQ 120(R13), R15
	VMOVDQU (R15)(R14*1), Y11
	MOVQ 144(R13), R15
	VMOVDQU (R15)(R14*1), Y12
	MOVQ 168(R13), R15
	VMOVDQU (R15)(R14*1), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 8 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 8 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 8 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	MOVQ (R13), R15
	VMOVDQU Y6, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ mulAvxGFNI_9x8Xor_loop
	// Only reached after YMM registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_9x8Xor_end:
	RET
// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start. For each
// block, every output o (0..8) is overwritten with
//   out[o] = XOR over i in 0..8 of VGF2P8AFFINEQB(in[i], matrix[i*9+o])
// The previous output contents are not read. Bytes past the last whole
// 64-byte block are left untouched. No slice length is read here, so the
// caller must guarantee that 9 inputs, 9 outputs and 81 matrix entries are
// addressable.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R12, DX  running data pointers of in[0..8]
//   R13                     base of the out slice headers (24 bytes each)
//   R14                     byte offset applied to every output, +64 per block
//   R15                     scratch: data pointer of the current output
//   Z0-Z20                  matrix[0..20], broadcast once before the loop
//   Z21-Z29                 accumulators for outputs 0..8
//   Z30                     current input block
//   Z31                     scratch product
TEXT ·mulGFNI_9x9_64(SB), $0-88
	// AX = number of whole 64-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so the branch needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ mulGFNI_9x9_64_end

	// Preload 21 of the 81 matrix entries; the rest are used as embedded
	// broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input from its 24-byte slice header.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Output data pointers are not kept in registers; they are re-read
	// through R13 every block and indexed by R14.
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs.
	// The first input initialises the accumulators, so nothing is XORed in.
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z21, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x9_64_loop
	// Only reached after ZMM registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_9x9_64_end:
	RET
// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Processes n>>5 blocks of 32 bytes, beginning at byte offset start. For each
// block, every output o (0..8) is overwritten with
//   out[o] = XOR over i in 0..8 of VGF2P8AFFINEQB(in[i], matrix[i*9+o])
// The previous output contents are not read. Bytes past the last whole
// 32-byte block are left untouched. No slice length is read here, so the
// caller must guarantee that 9 inputs, 9 outputs and 81 matrix entries are
// addressable.
//
// Register roles:
//   AX                      remaining 32-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R12, DX  running data pointers of in[0..8]
//   R13                     base of the out slice headers (24 bytes each)
//   R14                     byte offset applied to every output, +32 per block
//   R15                     scratch: data pointer of the current output
//   Y0-Y4                   matrix[0..4], broadcast once before the loop
//   Y5-Y13                  accumulators for outputs 0..8
//   Y14                     current input block
//   Y15                     scratch: broadcast matrix entry, then its product
TEXT ·mulAvxGFNI_9x9(SB), $0-88
	// AX = number of whole 32-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so the branch needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	JZ mulAvxGFNI_9x9_end

	// Preload 5 of the 81 matrix entries; the rest are broadcast on demand.
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input from its 24-byte slice header.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Output data pointers are not kept in registers; they are re-read
	// through R13 every block and indexed by R14.
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs.
	// The first input initialises the accumulators, so nothing is XORed in;
	// for outputs 5..8 the accumulator doubles as the broadcast target.
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R13), R15
	VMOVDQU Y5, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU Y6, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU Y7, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU Y8, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU Y9, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU Y10, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU Y11, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU Y12, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU Y13, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x20, R14
	DECQ AX
	JNZ mulAvxGFNI_9x9_loop
	// Only reached after YMM registers were used; the early exit skips it.
	VZEROUPPER

mulAvxGFNI_9x9_end:
	RET
// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Processes n>>6 blocks of 64 bytes, beginning at byte offset start. For each
// block, every output o (0..8) is read back from memory and updated as
//   out[o] ^= XOR over i in 0..8 of VGF2P8AFFINEQB(in[i], matrix[i*9+o])
// and then written back. Bytes past the last whole 64-byte block are left
// untouched. No slice length is read here, so the caller must guarantee that
// 9 inputs, 9 outputs and 81 matrix entries are addressable.
//
// Register roles:
//   AX                      remaining 64-byte blocks
//   CX                      matrix base
//   BX, SI, DI, R8-R12, DX  running data pointers of in[0..8]
//   R13                     base of the out slice headers (24 bytes each)
//   R14                     byte offset applied to every output, +64 per block
//   R15                     scratch: data pointer of the current output
//   Z0-Z20                  matrix[0..20], broadcast once before the loop
//   Z21-Z29                 accumulators for outputs 0..8
//   Z30                     current input block
//   Z31                     scratch product
TEXT ·mulGFNI_9x9_64Xor(SB), $0-88
	// AX = number of whole 64-byte blocks. SHRQ by a non-zero immediate sets
	// ZF from its result, so the branch needs no separate TESTQ.
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	JZ mulGFNI_9x9_64Xor_end

	// Preload 21 of the 81 matrix entries; the rest are used as embedded
	// broadcast memory operands inside the loop.
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input from its 24-byte slice header.
	// DX is overwritten last because it holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX

	// Output data pointers are not kept in registers; they are re-read
	// through R13 every block and indexed by R14.
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulGFNI_9x9_64Xor_loop:
	// Load 9 outputs
	MOVQ (R13), R15
	VMOVDQU64 (R15)(R14*1), Z21
	MOVQ 24(R13), R15
	VMOVDQU64 (R15)(R14*1), Z22
	MOVQ 48(R13), R15
	VMOVDQU64 (R15)(R14*1), Z23
	MOVQ 72(R13), R15
	VMOVDQU64 (R15)(R14*1), Z24
	MOVQ 96(R13), R15
	VMOVDQU64 (R15)(R14*1), Z25
	MOVQ 120(R13), R15
	VMOVDQU64 (R15)(R14*1), Z26
	MOVQ 144(R13), R15
	VMOVDQU64 (R15)(R14*1), Z27
	MOVQ 168(R13), R15
	VMOVDQU64 (R15)(R14*1), Z28
	MOVQ 192(R13), R15
	VMOVDQU64 (R15)(R14*1), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z21, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x9_64Xor_loop
	// Only reached after ZMM registers were used; the early exit skips it.
	VZEROUPPER

mulGFNI_9x9_64Xor_end:
	RET
// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x9Xor combines 9 inputs into 9 outputs, 32 bytes per input per
// loop iteration. This is the "Xor" variant: each output block is first loaded
// from memory, then XORed with the VGF2P8AFFINEQB product of every input block
// and its matrix entry, and finally written back to the same location.
//
// Only n>>5 whole 32-byte blocks are processed; any remainder of n below
// 32 bytes is left untouched by this routine.
//
// Register roles:
//   AX      number of 32-byte blocks still to process (n >> 5)
//   CX      matrix base; the 8-byte entry for (input i, output j) is read
//           from offset (i*9 + j)*8
//   Y0-Y4   matrix entries 0..4 (input 0, outputs 0..4), broadcast once
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//           read cursors for inputs 0..8 (start is added once, then +32/iter)
//   R13     base of out; the pointer for output j is re-read from 24*j(R13)
//           on every iteration
//   R14     byte offset applied to every output pointer (start, then +32/iter)
//   R15     scratch: current output pointer
//   Y5-Y13  accumulators for outputs 0..8
//   Y14     current 32-byte input block
//   Y15     scratch: broadcast matrix entry, then the affine product
TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88
// Loading 5 of 81 tables to registers
// Destination kept on stack
// Full registers estimated 92 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Early exit happens before any vector register is touched; the _end label
// sits after VZEROUPPER, so this path skips it.
JZ mulAvxGFNI_9x9Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
// Fetch the 9 input pointers; consecutive entries of in are 24 bytes apart.
// DX is used as the table base and is overwritten last with input 8.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (generator artifact).
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x9Xor_loop:
// Load 9 outputs
// Existing output bytes seed the accumulators Y5-Y13 (Xor variant).
MOVQ (R13), R15
VMOVDQU (R15)(R14*1), Y5
MOVQ 24(R13), R15
VMOVDQU (R15)(R14*1), Y6
MOVQ 48(R13), R15
VMOVDQU (R15)(R14*1), Y7
MOVQ 72(R13), R15
VMOVDQU (R15)(R14*1), Y8
MOVQ 96(R13), R15
VMOVDQU (R15)(R14*1), Y9
MOVQ 120(R13), R15
VMOVDQU (R15)(R14*1), Y10
MOVQ 144(R13), R15
VMOVDQU (R15)(R14*1), Y11
MOVQ 168(R13), R15
VMOVDQU (R15)(R14*1), Y12
MOVQ 192(R13), R15
VMOVDQU (R15)(R14*1), Y13
// Load and process 32 bytes from input 0 to 9 outputs
// Outputs 0..4 use the preloaded entries in Y0-Y4; outputs 5..8 broadcast
// their entry from memory into Y15 first.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 9 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 9 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 9 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 9 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 640(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
// Written back to the same addresses they were loaded from at the loop top.
MOVQ (R13), R15
VMOVDQU Y5, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y6, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y7, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y8, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 168(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 192(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 32-byte block and count down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x9Xor_loop
VZEROUPPER
mulAvxGFNI_9x9Xor_end:
RET
// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_9x10_64 combines 9 inputs into 10 outputs, 64 bytes per input per
// loop iteration. This is the overwriting variant: input 0 initialises the
// ten accumulators directly, inputs 1..8 are XORed in, and the result is
// stored without reading the previous output contents.
//
// Only n>>6 whole 64-byte blocks are processed; any remainder of n below
// 64 bytes is left untouched by this routine.
//
// Register roles:
//   AX       number of 64-byte blocks still to process (n >> 6)
//   CX       matrix base; the 8-byte entry for (input i, output j) is read
//            from offset (i*10 + j)*8
//   Z0-Z9    matrix entries for input 0, outputs 0..9 (broadcast once)
//   Z10-Z19  matrix entries for input 1, outputs 0..9 (broadcast once)
//            entries for inputs 2..8 are read from memory through the
//            .BCST (embedded broadcast) operand inside the loop
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//            read cursors for inputs 0..8 (start is added once, then +64/iter)
//   R13      base of out; the pointer for output j is re-read from 24*j(R13)
//            on every iteration
//   R14      byte offset applied to every output pointer (start, then +64/iter)
//   R15      scratch: current output pointer
//   Z20-Z29  accumulators for outputs 0..9
//   Z30      current 64-byte input block
//   Z31      scratch: affine product before it is XORed into an accumulator
TEXT ·mulGFNI_9x10_64(SB), $0-88
// Loading 20 of 90 tables to registers
// Destination kept on stack
// Full registers estimated 102 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Early exit happens before any vector register is touched; the _end label
// sits after VZEROUPPER, so this path skips it.
JZ mulGFNI_9x10_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the 9 input pointers; consecutive entries of in are 24 bytes apart.
// DX is used as the table base and is overwritten last with input 8.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (generator artifact).
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// Products are written straight into Z20-Z29, which initialises the
// accumulators for this iteration (no XOR with prior output data).
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
// Uses the preloaded entries in Z10-Z19.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// From here on the matrix entry comes from memory via the .BCST operand.
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 10 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
// Each output pointer is fetched from its 24-byte slot under R13 and
// indexed by the running offset R14.
MOVQ (R13), R15
VMOVDQU64 Z20, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z21, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z22, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z23, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z24, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 168(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 192(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 216(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 64-byte block and count down.
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x10_64_loop
VZEROUPPER
mulGFNI_9x10_64_end:
RET
// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_9x10 combines 9 inputs into 10 outputs, 32 bytes per input per
// loop iteration. This is the overwriting variant: input 0 initialises the
// ten accumulators directly, inputs 1..8 are XORed in, and the result is
// stored without reading the previous output contents.
//
// Only n>>5 whole 32-byte blocks are processed; any remainder of n below
// 32 bytes is left untouched by this routine.
//
// Register roles:
//   AX      number of 32-byte blocks still to process (n >> 5)
//   CX      matrix base; the 8-byte entry for (input i, output j) is read
//           from offset (i*10 + j)*8
//   Y0-Y3   matrix entries 0..3 (input 0, outputs 0..3), broadcast once
//   BX, SI, DI, R8, R9, R10, R11, R12, DX
//           read cursors for inputs 0..8 (start is added once, then +32/iter)
//   R13     base of out; the pointer for output j is re-read from 24*j(R13)
//           on every iteration
//   R14     byte offset applied to every output pointer (start, then +32/iter)
//   R15     scratch: current output pointer
//   Y4-Y13  accumulators for outputs 0..9
//   Y14     current 32-byte input block
//   Y15     scratch: broadcast matrix entry, then the affine product
TEXT ·mulAvxGFNI_9x10(SB), $0-88
// Loading 4 of 90 tables to registers
// Destination kept on stack
// Full registers estimated 102 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
// NOTE(review): SHRQ with a non-zero count already sets ZF from its result,
// so this TESTQ looks redundant (harmless).
TESTQ AX, AX
// Early exit happens before any vector register is touched; the _end label
// sits after VZEROUPPER, so this path skips it.
JZ mulAvxGFNI_9x10_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the 9 input pointers; consecutive entries of in are 24 bytes apart.
// DX is used as the table base and is overwritten last with input 8.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
// NOTE(review): the next instruction repeats the load above; it is redundant
// but harmless (generator artifact).
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// Outputs 0..3 use the preloaded entries in Y0-Y3. For outputs 4..9 the
// matrix entry is broadcast into the accumulator register itself and then
// replaced by the product, which initialises Y4-Y13 for this iteration.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
// From here on every entry is broadcast into Y15, multiplied, and XORed in.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 10 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 640(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 648(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 656(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 664(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 672(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 680(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 688(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 696(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 704(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 712(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
// Each output pointer is fetched from its 24-byte slot under R13 and
// indexed by the running offset R14.
MOVQ (R13), R15
VMOVDQU Y4, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y5, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y6, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y7, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y8, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 168(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 192(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 216(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Advance the shared output offset by one 32-byte block and count down.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x10_loop
VZEROUPPER
mulAvxGFNI_9x10_end:
RET
// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x10_64Xor(SB), $0-88
// Loading 20 of 90 tables to registers
// Destination kept on stack
// Full registers estimated 102 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_9x10_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulGFNI_9x10_64Xor_loop:
// Load 10 outputs
MOVQ (R13), R15
VMOVDQU64 (R15)(R14*1), Z20
MOVQ 24(R13), R15
VMOVDQU64 (R15)(R14*1), Z21
MOVQ 48(R13), R15
VMOVDQU64 (R15)(R14*1), Z22
MOVQ 72(R13), R15
VMOVDQU64 (R15)(R14*1), Z23
MOVQ 96(R13), R15
VMOVDQU64 (R15)(R14*1), Z24
MOVQ 120(R13), R15
VMOVDQU64 (R15)(R14*1), Z25
MOVQ 144(R13), R15
VMOVDQU64 (R15)(R14*1), Z26
MOVQ 168(R13), R15
VMOVDQU64 (R15)(R14*1), Z27
MOVQ 192(R13), R15
VMOVDQU64 (R15)(R14*1), Z28
MOVQ 216(R13), R15
VMOVDQU64 (R15)(R14*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 10 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R13), R15
VMOVDQU64 Z20, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU64 Z21, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU64 Z22, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU64 Z23, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU64 Z24, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU64 Z25, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU64 Z26, (R15)(R14*1)
MOVQ 168(R13), R15
VMOVDQU64 Z27, (R15)(R14*1)
MOVQ 192(R13), R15
VMOVDQU64 Z28, (R15)(R14*1)
MOVQ 216(R13), R15
VMOVDQU64 Z29, (R15)(R14*1)
// Prepare for next loop
ADDQ $0x40, R14
DECQ AX
JNZ mulGFNI_9x10_64Xor_loop
VZEROUPPER
mulGFNI_9x10_64Xor_end:
RET
// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each whole 32-byte chunk, and for every output j = 0..9, this loads the
// current 32 bytes of out[j], XORs in VGF2P8AFFINEQB(in[i] chunk, matrix[i*10+j])
// (constant term 0) for every input i = 0..8, and writes the result back to out[j].
// Work starts at byte offset start in every slice and covers n>>5 chunks; the
// remaining n&31 bytes are not touched.
//
// Register roles:
//   AX                              remaining chunk count
//   CX                              matrix base (tables 4..89 are broadcast from memory)
//   Y0-Y3                           matrix[0..3], broadcast once before the loop
//   BX, SI, DI, R8-R12, DX          running pointers into in[0..8]
//   R13                             base of the out slice-header array (24-byte stride)
//   R14                             running byte offset applied to every output
//   R15                             scratch: current output data pointer
//   Y4-Y13                          accumulators for out[0..9]
//   Y14 / Y15                       current input chunk / scratch product
//
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88
// Loading 4 of 90 tables to registers
// Destination pointers are re-read from the out slice headers on every iteration
// Full registers estimated 102 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_9x10Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input slice; DX is overwritten last because it
// holds the slice-header base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), DX
MOVQ out_base+48(FP), R13
MOVQ start+72(FP), R14
// Add start offset to input
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, DX
mulAvxGFNI_9x10Xor_loop:
// Load 10 outputs
MOVQ (R13), R15
VMOVDQU (R15)(R14*1), Y4
MOVQ 24(R13), R15
VMOVDQU (R15)(R14*1), Y5
MOVQ 48(R13), R15
VMOVDQU (R15)(R14*1), Y6
MOVQ 72(R13), R15
VMOVDQU (R15)(R14*1), Y7
MOVQ 96(R13), R15
VMOVDQU (R15)(R14*1), Y8
MOVQ 120(R13), R15
VMOVDQU (R15)(R14*1), Y9
MOVQ 144(R13), R15
VMOVDQU (R15)(R14*1), Y10
MOVQ 168(R13), R15
VMOVDQU (R15)(R14*1), Y11
MOVQ 192(R13), R15
VMOVDQU (R15)(R14*1), Y12
MOVQ 216(R13), R15
VMOVDQU (R15)(R14*1), Y13
// Load and process 32 bytes from input 0 to 10 outputs
// The first four tables come from Y0-Y3; the rest are broadcast from memory.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y4, Y15, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 32(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 10 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 640(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 648(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 656(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 664(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 672(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 680(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 688(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 696(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 704(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 712(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
MOVQ (R13), R15
VMOVDQU Y4, (R15)(R14*1)
MOVQ 24(R13), R15
VMOVDQU Y5, (R15)(R14*1)
MOVQ 48(R13), R15
VMOVDQU Y6, (R15)(R14*1)
MOVQ 72(R13), R15
VMOVDQU Y7, (R15)(R14*1)
MOVQ 96(R13), R15
VMOVDQU Y8, (R15)(R14*1)
MOVQ 120(R13), R15
VMOVDQU Y9, (R15)(R14*1)
MOVQ 144(R13), R15
VMOVDQU Y10, (R15)(R14*1)
MOVQ 168(R13), R15
VMOVDQU Y11, (R15)(R14*1)
MOVQ 192(R13), R15
VMOVDQU Y12, (R15)(R14*1)
MOVQ 216(R13), R15
VMOVDQU Y13, (R15)(R14*1)
// Prepare for next loop
// Outputs are addressed as pointer+R14, so only the shared offset advances.
ADDQ $0x20, R14
DECQ AX
JNZ mulAvxGFNI_9x10Xor_loop
VZEROUPPER
mulAvxGFNI_9x10Xor_end:
RET
// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each whole 64-byte chunk, computes the XOR over i = 0..9 of
// VGF2P8AFFINEQB(in[i] chunk, matrix[i]) (constant term 0) and stores it to out[0],
// overwriting what was there. Work starts at byte offset start in every slice and
// covers n>>6 chunks; the remaining n&63 bytes are not touched.
//
// Register roles:
//   AX                              remaining chunk count
//   Z0-Z9                           matrix[0..9], broadcast once before the loop
//   DX, BX, SI, DI, R8-R12, CX      running pointers into in[0..9]
//   R13                             running pointer into out[0]
//   Z10 / Z11                       accumulator / current input chunk and product
//
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulGFNI_10x1_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 13 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_10x1_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
// All tables are in registers now, so CX is reused for the input slice headers
// and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), R11
MOVQ 192(CX), R12
MOVQ 216(CX), CX
MOVQ out_base+48(FP), R13
MOVQ (R13), R13
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R13
// Add start offset to input
ADDQ R14, DX
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, CX
mulGFNI_10x1_64_loop:
// Load and process 64 bytes from input 0 to 1 outputs
// The first product initialises the accumulator Z10 (no XOR with old output).
VMOVDQU64 (DX), Z11
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z11, Z10
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z11
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z11
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z11
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z11
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (R9), Z11
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 6 to 1 outputs
VMOVDQU64 (R10), Z11
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 7 to 1 outputs
VMOVDQU64 (R11), Z11
ADDQ $0x40, R11
VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 8 to 1 outputs
VMOVDQU64 (R12), Z11
ADDQ $0x40, R12
VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 9 to 1 outputs
VMOVDQU64 (CX), Z11
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
VXORPD Z10, Z11, Z10
// Store 1 outputs
VMOVDQU64 Z10, (R13)
ADDQ $0x40, R13
// Prepare for next loop
DECQ AX
JNZ mulGFNI_10x1_64_loop
VZEROUPPER
mulGFNI_10x1_64_end:
RET
// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each whole 32-byte chunk, computes the XOR over i = 0..9 of
// VGF2P8AFFINEQB(in[i] chunk, matrix[i]) (constant term 0) and stores it to out[0],
// overwriting what was there. Work starts at byte offset start in every slice and
// covers n>>5 chunks; the remaining n&31 bytes are not touched.
//
// Register roles:
//   AX                              remaining chunk count
//   Y0-Y9                           matrix[0..9], broadcast once before the loop
//   DX, BX, SI, DI, R8-R12, CX      running pointers into in[0..9]
//   R13                             running pointer into out[0]
//   Y10 / Y11                       accumulator / current input chunk and product
//
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulAvxGFNI_10x1(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 13 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_10x1_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// All tables are in registers now, so CX is reused for the input slice headers
// and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), R11
MOVQ 192(CX), R12
MOVQ 216(CX), CX
MOVQ out_base+48(FP), R13
MOVQ (R13), R13
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R13
// Add start offset to input
ADDQ R14, DX
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, CX
mulAvxGFNI_10x1_loop:
// Load and process 32 bytes from input 0 to 1 outputs
// The first product initialises the accumulator Y10 (no XOR with old output).
VMOVDQU (DX), Y11
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y11, Y10
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y11
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y11
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y11
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y11
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (R9), Y11
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 6 to 1 outputs
VMOVDQU (R10), Y11
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 7 to 1 outputs
VMOVDQU (R11), Y11
ADDQ $0x20, R11
VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 8 to 1 outputs
VMOVDQU (R12), Y11
ADDQ $0x20, R12
VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 9 to 1 outputs
VMOVDQU (CX), Y11
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
VXORPD Y10, Y11, Y10
// Store 1 outputs
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_10x1_loop
VZEROUPPER
mulAvxGFNI_10x1_end:
RET
// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each whole 64-byte chunk, loads the current 64 bytes of out[0], XORs in
// VGF2P8AFFINEQB(in[i] chunk, matrix[i]) (constant term 0) for every i = 0..9, and
// writes the result back to out[0]. Work starts at byte offset start in every
// slice and covers n>>6 chunks; the remaining n&63 bytes are not touched.
//
// Register roles:
//   AX                              remaining chunk count
//   Z0-Z9                           matrix[0..9], broadcast once before the loop
//   DX, BX, SI, DI, R8-R12, CX      running pointers into in[0..9]
//   R13                             running pointer into out[0]
//   Z10 / Z11                       accumulator / current input chunk and product
//
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulGFNI_10x1_64Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 13 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_10x1_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
// All tables are in registers now, so CX is reused for the input slice headers
// and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), R11
MOVQ 192(CX), R12
MOVQ 216(CX), CX
MOVQ out_base+48(FP), R13
MOVQ (R13), R13
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R13
// Add start offset to input
ADDQ R14, DX
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, CX
mulGFNI_10x1_64Xor_loop:
// Load 1 outputs
VMOVDQU64 (R13), Z10
// Load and process 64 bytes from input 0 to 1 outputs
VMOVDQU64 (DX), Z11
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 1 to 1 outputs
VMOVDQU64 (BX), Z11
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 2 to 1 outputs
VMOVDQU64 (SI), Z11
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 3 to 1 outputs
VMOVDQU64 (DI), Z11
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 4 to 1 outputs
VMOVDQU64 (R8), Z11
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 5 to 1 outputs
VMOVDQU64 (R9), Z11
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 6 to 1 outputs
VMOVDQU64 (R10), Z11
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 7 to 1 outputs
VMOVDQU64 (R11), Z11
ADDQ $0x40, R11
VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 8 to 1 outputs
VMOVDQU64 (R12), Z11
ADDQ $0x40, R12
VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
VXORPD Z10, Z11, Z10
// Load and process 64 bytes from input 9 to 1 outputs
VMOVDQU64 (CX), Z11
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
VXORPD Z10, Z11, Z10
// Store 1 outputs
VMOVDQU64 Z10, (R13)
ADDQ $0x40, R13
// Prepare for next loop
DECQ AX
JNZ mulGFNI_10x1_64Xor_loop
VZEROUPPER
mulGFNI_10x1_64Xor_end:
RET
// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each whole 32-byte chunk, loads the current 32 bytes of out[0], XORs in
// VGF2P8AFFINEQB(in[i] chunk, matrix[i]) (constant term 0) for every i = 0..9, and
// writes the result back to out[0]. Work starts at byte offset start in every
// slice and covers n>>5 chunks; the remaining n&31 bytes are not touched.
//
// Register roles:
//   AX                              remaining chunk count
//   Y0-Y9                           matrix[0..9], broadcast once before the loop
//   DX, BX, SI, DI, R8-R12, CX      running pointers into in[0..9]
//   R13                             running pointer into out[0]
//   Y10 / Y11                       accumulator / current input chunk and product
//
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 13 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_10x1Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// All tables are in registers now, so CX is reused for the input slice headers
// and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), R11
MOVQ 192(CX), R12
MOVQ 216(CX), CX
MOVQ out_base+48(FP), R13
MOVQ (R13), R13
MOVQ start+72(FP), R14
// Add start offset to output
ADDQ R14, R13
// Add start offset to input
ADDQ R14, DX
ADDQ R14, BX
ADDQ R14, SI
ADDQ R14, DI
ADDQ R14, R8
ADDQ R14, R9
ADDQ R14, R10
ADDQ R14, R11
ADDQ R14, R12
ADDQ R14, CX
mulAvxGFNI_10x1Xor_loop:
// Load 1 outputs
VMOVDQU (R13), Y10
// Load and process 32 bytes from input 0 to 1 outputs
VMOVDQU (DX), Y11
ADDQ $0x20, DX
VGF2P8AFFINEQB $0x00, Y0, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 1 to 1 outputs
VMOVDQU (BX), Y11
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 2 to 1 outputs
VMOVDQU (SI), Y11
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 3 to 1 outputs
VMOVDQU (DI), Y11
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 4 to 1 outputs
VMOVDQU (R8), Y11
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 5 to 1 outputs
VMOVDQU (R9), Y11
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 6 to 1 outputs
VMOVDQU (R10), Y11
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 7 to 1 outputs
VMOVDQU (R11), Y11
ADDQ $0x20, R11
VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 8 to 1 outputs
VMOVDQU (R12), Y11
ADDQ $0x20, R12
VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
VXORPD Y10, Y11, Y10
// Load and process 32 bytes from input 9 to 1 outputs
VMOVDQU (CX), Y11
ADDQ $0x20, CX
VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
VXORPD Y10, Y11, Y10
// Store 1 outputs
VMOVDQU Y10, (R13)
ADDQ $0x20, R13
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_10x1Xor_loop
VZEROUPPER
mulAvxGFNI_10x1Xor_end:
RET
// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For each whole 64-byte chunk and each output j = 0..1, computes the XOR over
// i = 0..9 of VGF2P8AFFINEQB(in[i] chunk, matrix[i*2+j]) (constant term 0) and
// stores it to out[j], overwriting what was there. Work starts at byte offset
// start in every slice and covers n>>6 chunks; the remaining n&63 bytes are not
// touched.
//
// Register roles:
//   AX                              remaining chunk count
//   Z0-Z19                          matrix[0..19], broadcast once before the loop
//   DX, BX, SI, DI, R8-R12, CX      running pointers into in[0..9]
//   R14, R13                        running pointers into out[0], out[1]
//   Z20, Z21                        accumulators for out[0], out[1]
//   Z22 / Z23                       current input chunk / scratch product
//
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulGFNI_10x2_64(SB), $0-88
// Loading all tables to registers
// Destination kept in GP registers
// Full registers estimated 24 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x06, AX
JZ mulGFNI_10x2_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// All tables are in registers now, so CX is reused for the input slice headers
// and finally for the last input pointer.
MOVQ in_base+24(FP), CX
MOVQ (CX), DX
MOVQ 24(CX), BX
MOVQ 48(CX), SI
MOVQ 72(CX), DI
MOVQ 96(CX), R8
MOVQ 120(CX), R9
MOVQ 144(CX), R10
MOVQ 168(CX), R11
MOVQ 192(CX), R12
MOVQ 216(CX), CX
// R13 holds the out slice-header base until the second data pointer replaces it.
MOVQ out_base+48(FP), R13
MOVQ (R13), R14
MOVQ 24(R13), R13
MOVQ start+72(FP), R15
// Add start offset to output
ADDQ R15, R14
ADDQ R15, R13
// Add start offset to input
ADDQ R15, DX
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, CX
mulGFNI_10x2_64_loop:
// Load and process 64 bytes from input 0 to 2 outputs
// The first products initialise the accumulators Z20/Z21 (no XOR with old output).
VMOVDQU64 (DX), Z22
ADDQ $0x40, DX
VGF2P8AFFINEQB $0x00, Z0, Z22, Z20
VGF2P8AFFINEQB $0x00, Z1, Z22, Z21
// Load and process 64 bytes from input 1 to 2 outputs
VMOVDQU64 (BX), Z22
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 2 to 2 outputs
VMOVDQU64 (SI), Z22
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 3 to 2 outputs
VMOVDQU64 (DI), Z22
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 4 to 2 outputs
VMOVDQU64 (R8), Z22
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 5 to 2 outputs
VMOVDQU64 (R9), Z22
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 6 to 2 outputs
VMOVDQU64 (R10), Z22
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 7 to 2 outputs
VMOVDQU64 (R11), Z22
ADDQ $0x40, R11
VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 8 to 2 outputs
VMOVDQU64 (R12), Z22
ADDQ $0x40, R12
VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
VXORPD Z21, Z23, Z21
// Load and process 64 bytes from input 9 to 2 outputs
VMOVDQU64 (CX), Z22
ADDQ $0x40, CX
VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
VXORPD Z20, Z23, Z20
VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
VXORPD Z21, Z23, Z21
// Store 2 outputs
VMOVDQU64 Z20, (R14)
ADDQ $0x40, R14
VMOVDQU64 Z21, (R13)
ADDQ $0x40, R13
// Prepare for next loop
DECQ AX
JNZ mulGFNI_10x2_64_loop
VZEROUPPER
mulGFNI_10x2_64_end:
RET
// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For each whole 32-byte chunk and each output j = 0..1, computes the XOR over
// i = 0..9 of VGF2P8AFFINEQB(in[i] chunk, matrix[i*2+j]) (constant term 0) and
// stores it to out[j], overwriting what was there. Work starts at byte offset
// start in every slice and covers n>>5 chunks; the remaining n&31 bytes are not
// touched.
//
// Register roles:
//   AX                              remaining chunk count
//   CX                              matrix base (tables 12..19 are broadcast from memory)
//   Y0-Y11                          matrix[0..11], broadcast once before the loop
//   BX, SI, DI, R8-R13, DX          running pointers into in[0..9]
//   R15, R14                        running pointers into out[0], out[1]
//   BP                              start offset (only used during setup)
//   Y12, Y13                        accumulators for out[0], out[1]
//   Y14 / Y15                       current input chunk / scratch product
//
// NOTE(review): BP is used as a general-purpose register here; the non-zero frame
// size ($8) presumably exists so that the assembler-generated prologue/epilogue
// preserves BP — confirm against the Go assembler documentation.
// NOTE(review): this file is generated (see the header of the file); the removal of
// the duplicated out_base load and of the TESTQ after SHRQ should be mirrored in gen.go.
TEXT ·mulAvxGFNI_10x2(SB), $8-88
// Loading 12 of 20 tables to registers
// Destination kept in GP registers
// Full registers estimated 24 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32. SHRQ sets ZF from its result, so JZ can follow it directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_10x2_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
VBROADCASTSD 80(CX), Y10
VBROADCASTSD 88(CX), Y11
// CX must keep pointing at the matrix for the in-loop broadcasts, so the input
// slice headers are walked through DX, which is overwritten last.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 holds the out slice-header base until the second data pointer replaces it.
MOVQ out_base+48(FP), R14
MOVQ (R14), R15
MOVQ 24(R14), R14
MOVQ start+72(FP), BP
// Add start offset to output
ADDQ BP, R15
ADDQ BP, R14
// Add start offset to input
ADDQ BP, BX
ADDQ BP, SI
ADDQ BP, DI
ADDQ BP, R8
ADDQ BP, R9
ADDQ BP, R10
ADDQ BP, R11
ADDQ BP, R12
ADDQ BP, R13
ADDQ BP, DX
mulAvxGFNI_10x2_loop:
// Load and process 32 bytes from input 0 to 2 outputs
// The first products initialise the accumulators Y12/Y13 (no XOR with old output).
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
VGF2P8AFFINEQB $0x00, Y1, Y14, Y13
// Load and process 32 bytes from input 1 to 2 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 2 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 2 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 2 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 2 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 2 outputs
// From here on the tables are not resident and are broadcast from memory.
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 2 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 2 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 2 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 2 outputs
VMOVDQU Y12, (R15)
ADDQ $0x20, R15
VMOVDQU Y13, (R14)
ADDQ $0x20, R14
// Prepare for next loop
DECQ AX
JNZ mulAvxGFNI_10x2_loop
VZEROUPPER
mulAvxGFNI_10x2_end:
RET
// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x2_64Xor combines 10 input slices into 2 output slices, 64 bytes
// per slice per iteration, XORing the result into what is already stored in
// the outputs.
//
// matrix holds 20 64-bit entries ordered as input*2+output. Each entry is
// broadcast to a full ZMM register and used as the 8x8 bit-matrix operand of
// VGF2P8AFFINEQB with constant term $0x00 (presumably the GF(2^8) coefficient
// for that input/output pair; the tables are built elsewhere in the package).
//
// The loop runs n>>6 times starting at byte offset start of every slice; any
// remainder of n below a multiple of 64 is not processed here. When n>>6 is
// zero the function returns without touching memory.
//
// Register use:
//   AX         remaining 64-byte iterations
//   CX         matrix base during setup, then pointer into in[9]
//   DX, BX, SI, DI, R8-R12   pointers into in[0]..in[8]
//   R14, R13   pointers into out[0], out[1]
//   R15        start offset (setup only)
//   Z0-Z19     broadcast matrix entries
//   Z20, Z21   accumulators for out[0], out[1]
//   Z22        current input block, Z23 scratch product
TEXT ·mulGFNI_10x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 24 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_10x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// CX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	// Fetch the data pointers of out[0] (R14) and out[1] (R13).
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R13
	MOVQ start+72(FP), R15
	// Add start offset to output
	ADDQ R15, R14
	ADDQ R15, R13
	// Add start offset to input
	ADDQ R15, DX
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, CX

mulGFNI_10x2_64Xor_loop:
	// Load 2 outputs (existing contents are the starting accumulator values)
	VMOVDQU64 (R14), Z20
	VMOVDQU64 (R13), Z21
	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (DX), Z22
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z22
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z3, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z22
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z5, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z22
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z7, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z22
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z9, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z22
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (R10), Z22
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z12, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z13, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 7 to 2 outputs
	VMOVDQU64 (R11), Z22
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z14, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z15, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 8 to 2 outputs
	VMOVDQU64 (R12), Z22
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z16, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z17, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Load and process 64 bytes from input 9 to 2 outputs
	VMOVDQU64 (CX), Z22
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z22, Z23
	VXORPD Z20, Z23, Z20
	VGF2P8AFFINEQB $0x00, Z19, Z22, Z23
	VXORPD Z21, Z23, Z21
	// Store 2 outputs
	VMOVDQU64 Z20, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z21, (R13)
	ADDQ $0x40, R13
	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_10x2_64Xor_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulGFNI_10x2_64Xor_end:
	RET
// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x2Xor combines 10 input slices into 2 output slices, 32 bytes
// per slice per iteration, XORing the result into what is already stored in
// the outputs.
//
// matrix holds 20 64-bit entries ordered as input*2+output. Each entry is
// broadcast to a YMM register and used as the 8x8 bit-matrix operand of
// VGF2P8AFFINEQB with constant term $0x00. Entries 0-11 stay resident in
// Y0-Y11; entries 12-19 (inputs 6-9) are re-broadcast from memory through CX
// on every iteration because only 16 YMM registers exist.
//
// The loop runs n>>5 times starting at byte offset start of every slice; any
// remainder of n below a multiple of 32 is not processed here. When n>>5 is
// zero the function returns without touching memory.
//
// Register use:
//   AX         remaining 32-byte iterations
//   CX         matrix base (live for the whole loop)
//   BX, SI, DI, R8-R13, DX   pointers into in[0]..in[9]
//   R15, R14   pointers into out[0], out[1]
//   BP         start offset (setup only)
//   Y12, Y13   accumulators for out[0], out[1]
//   Y14        current input block, Y15 scratch table/product
//
// NOTE(review): BP is clobbered; the $8 frame presumably exists so the Go
// assembler saves and restores the frame pointer. Confirm against the generator.
TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88
	// Loading 12 of 20 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 24 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	// Fetch the data pointers of out[0] (R15) and out[1] (R14).
	MOVQ out_base+48(FP), R14
	MOVQ (R14), R15
	MOVQ 24(R14), R14
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R15
	ADDQ BP, R14
	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, DX

mulAvxGFNI_10x2Xor_loop:
	// Load 2 outputs (existing contents are the starting accumulator values)
	VMOVDQU (R15), Y12
	VMOVDQU (R14), Y13
	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 2 outputs
	// (from here on the tables are broadcast from memory into Y15)
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 2 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 8 to 2 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 9 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 2 outputs
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R14)
	ADDQ $0x20, R14
	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_10x2Xor_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulAvxGFNI_10x2Xor_end:
	RET
// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x3_64 combines 10 input slices into 3 output slices, 64 bytes per
// slice per iteration. The outputs are overwritten: input 0 initialises the
// accumulators and inputs 1-9 are XORed on top.
//
// matrix holds 30 64-bit entries ordered as input*3+output. Each entry is
// used as the 8x8 bit-matrix operand of VGF2P8AFFINEQB with constant term
// $0x00. Entries 0-26 are broadcast once into Z0-Z26; entries 27-29 (input 9)
// are applied straight from memory with an embedded broadcast (.BCST).
//
// The loop runs n>>6 times starting at byte offset start of every slice; any
// remainder of n below a multiple of 64 is not processed here. When n>>6 is
// zero the function returns without touching memory.
//
// Register use:
//   AX         zero-length check, then in header base, then pointer into in[9]
//   CX         matrix base (live for the whole loop)
//   DX, BX, SI, DI, R8-R12   pointers into in[0]..in[8]
//   R14, R15, R13   pointers into out[0], out[1], out[2]
//   BP         start offset during setup, then remaining iterations
//   Z27-Z29    accumulators for out[0]..out[2]
//   Z30        current input block, Z31 scratch product
//
// NOTE(review): BP is clobbered; the $8 frame presumably exists so the Go
// assembler saves and restores the frame pointer. Confirm against the generator.
TEXT ·mulGFNI_10x3_64(SB), $8-88
	// Loading 27 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_10x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	VBROADCASTF32X2 208(CX), Z26
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// AX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), R12
	MOVQ 216(AX), AX
	// Fetch the data pointers of out[0] (R14), out[1] (R15) and out[2] (R13).
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_10x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	// (writes the accumulators directly, so previous output contents are ignored)
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z29
	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 8 to 3 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 9 to 3 outputs
	// (tables 27-29 are not resident; they are broadcast from memory by the instruction)
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 3 outputs
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R13)
	ADDQ $0x40, R13
	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_10x3_64_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulGFNI_10x3_64_end:
	RET
// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x3 combines 10 input slices into 3 output slices, 32 bytes per
// slice per iteration. The outputs are overwritten: input 0 initialises the
// accumulators and inputs 1-9 are XORed on top.
//
// matrix holds 30 64-bit entries ordered as input*3+output. Each entry is
// broadcast to a YMM register and used as the 8x8 bit-matrix operand of
// VGF2P8AFFINEQB with constant term $0x00. Entries 0-10 stay resident in
// Y0-Y10; entries 11-29 are re-broadcast from memory through CX on every
// iteration because only 16 YMM registers exist.
//
// The loop runs n>>5 times starting at byte offset start of every slice; any
// remainder of n below a multiple of 32 is not processed here. When n>>5 is
// zero the function returns without touching memory.
//
// Register use:
//   AX         zero-length check, then in header base, then pointer into in[9]
//   CX         matrix base (live for the whole loop)
//   DX, BX, SI, DI, R8-R12   pointers into in[0]..in[8]
//   R14, R15, R13   pointers into out[0], out[1], out[2]
//   BP         start offset during setup, then remaining iterations
//   Y11-Y13    accumulators for out[0]..out[2]
//   Y14        current input block, Y15 scratch table/product
//
// NOTE(review): BP is clobbered; the $8 frame presumably exists so the Go
// assembler saves and restores the frame pointer. Confirm against the generator.
TEXT ·mulAvxGFNI_10x3(SB), $8-88
	// Loading 11 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// AX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), R12
	MOVQ 216(AX), AX
	// Fetch the data pointers of out[0] (R14), out[1] (R15) and out[2] (R13).
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_10x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	// (writes the accumulators directly, so previous output contents are ignored)
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13
	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 3 outputs
	// (the third table and all later ones are broadcast from memory into Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 9 to 3 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13
	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_10x3_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulAvxGFNI_10x3_end:
	RET
// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x3_64Xor combines 10 input slices into 3 output slices, 64 bytes
// per slice per iteration, XORing the result into what is already stored in
// the outputs.
//
// matrix holds 30 64-bit entries ordered as input*3+output. Each entry is
// used as the 8x8 bit-matrix operand of VGF2P8AFFINEQB with constant term
// $0x00. Entries 0-26 are broadcast once into Z0-Z26; entries 27-29 (input 9)
// are applied straight from memory with an embedded broadcast (.BCST).
//
// The loop runs n>>6 times starting at byte offset start of every slice; any
// remainder of n below a multiple of 64 is not processed here. When n>>6 is
// zero the function returns without touching memory.
//
// Register use:
//   AX         zero-length check, then in header base, then pointer into in[9]
//   CX         matrix base (live for the whole loop)
//   DX, BX, SI, DI, R8-R12   pointers into in[0]..in[8]
//   R14, R15, R13   pointers into out[0], out[1], out[2]
//   BP         start offset during setup, then remaining iterations
//   Z27-Z29    accumulators for out[0]..out[2]
//   Z30        current input block, Z31 scratch product
//
// NOTE(review): BP is clobbered; the $8 frame presumably exists so the Go
// assembler saves and restores the frame pointer. Confirm against the generator.
TEXT ·mulGFNI_10x3_64Xor(SB), $8-88
	// Loading 27 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_10x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	VBROADCASTF32X2 208(CX), Z26
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// AX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), R12
	MOVQ 216(AX), AX
	// Fetch the data pointers of out[0] (R14), out[1] (R15) and out[2] (R13).
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_10x3_64Xor_loop:
	// Load 3 outputs (existing contents are the starting accumulator values)
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R13), Z29
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 7 to 3 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 8 to 3 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z26, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 9 to 3 outputs
	// (tables 27-29 are not resident; they are broadcast from memory by the instruction)
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 3 outputs
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R13)
	ADDQ $0x40, R13
	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_10x3_64Xor_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulGFNI_10x3_64Xor_end:
	RET
// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x3Xor combines 10 input slices into 3 output slices, 32 bytes
// per slice per iteration, XORing the result into what is already stored in
// the outputs.
//
// matrix holds 30 64-bit entries ordered as input*3+output. Each entry is
// broadcast to a YMM register and used as the 8x8 bit-matrix operand of
// VGF2P8AFFINEQB with constant term $0x00. Entries 0-10 stay resident in
// Y0-Y10; entries 11-29 are re-broadcast from memory through CX on every
// iteration because only 16 YMM registers exist.
//
// The loop runs n>>5 times starting at byte offset start of every slice; any
// remainder of n below a multiple of 32 is not processed here. When n>>5 is
// zero the function returns without touching memory.
//
// Register use:
//   AX         zero-length check, then in header base, then pointer into in[9]
//   CX         matrix base (live for the whole loop)
//   DX, BX, SI, DI, R8-R12   pointers into in[0]..in[8]
//   R14, R15, R13   pointers into out[0], out[1], out[2]
//   BP         start offset during setup, then remaining iterations
//   Y11-Y13    accumulators for out[0]..out[2]
//   Y14        current input block, Y15 scratch table/product
//
// NOTE(review): BP is clobbered; the $8 frame presumably exists so the Go
// assembler saves and restores the frame pointer. Confirm against the generator.
TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88
	// Loading 11 of 30 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 35 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// AX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), R11
	MOVQ 192(AX), R12
	MOVQ 216(AX), AX
	// Fetch the data pointers of out[0] (R14), out[1] (R15) and out[2] (R13).
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R14
	MOVQ 24(R13), R15
	MOVQ 48(R13), R13
	MOVQ start+72(FP), BP
	// Add start offset to output
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R13
	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, AX
	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_10x3Xor_loop:
	// Load 3 outputs (existing contents are the starting accumulator values)
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R13), Y13
	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 3 to 3 outputs
	// (the third table and all later ones are broadcast from memory into Y15)
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 7 to 3 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 8 to 3 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 9 to 3 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Store 3 outputs
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R13)
	ADDQ $0x20, R13
	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_10x3Xor_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulAvxGFNI_10x3Xor_end:
	RET
// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x4_64 combines 10 input slices into 4 output slices, 64 bytes per
// slice per iteration. The outputs are overwritten: input 0 initialises the
// accumulators and inputs 1-9 are XORed on top.
//
// matrix holds 40 64-bit entries ordered as input*4+output. Each entry is
// used as the 8x8 bit-matrix operand of VGF2P8AFFINEQB with constant term
// $0x00. Entries 0-25 are broadcast once into Z0-Z25; entries 26-39 are
// applied straight from memory with an embedded broadcast (.BCST).
//
// Unlike the 10x2/10x3 kernels there are not enough general-purpose registers
// for four output pointers, so each output base pointer is re-read from the
// out slice headers on every iteration and indexed with a running byte
// offset that starts at start.
//
// The loop runs n>>6 times; any remainder of n below a multiple of 64 is not
// processed here. When n>>6 is zero the function returns without touching
// memory.
//
// Register use:
//   AX         remaining 64-byte iterations
//   CX         matrix base (live for the whole loop)
//   BX, SI, DI, R8-R13, DX   pointers into in[0]..in[9]
//   R14        base of the out slice-header array
//   R15        byte offset into every output (start, then +64 per iteration)
//   BP         scratch: data pointer of the output being stored
//   Z26-Z29    accumulators for out[0]..out[3]
//   Z30        current input block, Z31 scratch product
//
// NOTE(review): BP is clobbered; the $8 frame presumably exists so the Go
// assembler saves and restores the frame pointer. Confirm against the generator.
TEXT ·mulGFNI_10x4_64(SB), $8-88
	// Loading 26 of 40 tables to registers
	// Destination pointers are re-read from the out slice headers each iteration
	// Full registers estimated 46 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so JZ can test it directly.
	SHRQ $0x06, AX
	JZ mulGFNI_10x4_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	VBROADCASTF32X2 200(CX), Z25
	// Fetch the data pointer of each input slice (slice headers are 24 bytes apart).
	// DX is overwritten last because it is also the base of the header array.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	// R14 keeps the out header array; R15 is the running output offset.
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15
	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x4_64_loop:
	// Load and process 64 bytes from input 0 to 4 outputs
	// (writes the accumulators directly, so previous output contents are ignored)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z29
	// Load and process 64 bytes from input 1 to 4 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 2 to 4 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 3 to 4 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 4 to 4 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 5 to 4 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 6 to 4 outputs
	// (tables 26 onward are not resident; they are broadcast from memory by the instruction)
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 7 to 4 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 8 to 4 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Load and process 64 bytes from input 9 to 4 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29
	// Store 4 outputs: fetch each output's data pointer, then write at offset R15
	MOVQ (R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)
	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x4_64_loop
	// Clear upper vector state before returning to non-AVX code.
	VZEROUPPER

mulGFNI_10x4_64_end:
	RET
// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x4(SB), $8-88
// Loading 10 of 40 tables to registers
// Destination kept on stack
// Full registers estimated 46 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x4_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x4_loop:
// Load and process 32 bytes from input 0 to 4 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
// Load and process 32 bytes from input 1 to 4 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 4 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 4 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 4 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 4 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 4 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 4 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 4 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 4 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 4 outputs
MOVQ (R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x4_loop
VZEROUPPER
mulAvxGFNI_10x4_end:
RET
// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte block, XORs into each of the 4 outputs the GF(2^8) affine
// products of the 10 inputs: out[j] ^= affine(matrix[i*4+j], in[i]) for
// i = 0..9, j = 0..3. The previous contents of the outputs are loaded first
// and used as the starting accumulator values ("Xor" variant).
//
// Only n>>6 full 64-byte blocks are processed; if that is zero the routine
// returns without touching memory. Any remainder of n below a multiple of 64
// is not handled here.
//
// Register roles:
//   AX                      remaining 64-byte iterations
//   CX                      base of matrix (one uint64 per 8 bytes)
//   BX,SI,DI,R8-R13,DX      read pointers for inputs 0..9 (start already added)
//   R14                     base of the out slice-header array
//   R15                     byte offset into every output (start, +64 per pass)
//   BP                      scratch: data pointer of the output being accessed
//   Z0-Z25                  matrix entries 0..25, broadcast to all lanes
//   Z26-Z29                 accumulators for outputs 0..3
//   Z30 / Z31               current input block / product scratch
TEXT ·mulGFNI_10x4_64Xor(SB), $8-88
// Loading 26 of 40 tables to registers
// Destination kept on stack
// Full registers estimated 46 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of loop iterations; bail out when there are none.
// (SHRQ already sets ZF, so the TESTQ is redundant but harmless.)
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x4_64Xor_end
// Preload the first 26 matrix entries; entries 26..39 are read from memory
// with an embedded broadcast inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
VBROADCASTF32X2 200(CX), Z25
// Fetch the data pointer of each of the 10 input slices. Slice headers are
// 24 bytes apart; DX is overwritten last because it holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = base of the out slice headers. The same load is emitted twice; the
// second one is redundant but harmless.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x4_64Xor_loop:
// Load 4 outputs
// (each output's data pointer is re-read from its slice header via R14 and
// indexed by the running offset R15)
MOVQ (R14), BP
VMOVDQU64 (BP)(R15*1), Z26
MOVQ 24(R14), BP
VMOVDQU64 (BP)(R15*1), Z27
MOVQ 48(R14), BP
VMOVDQU64 (BP)(R15*1), Z28
MOVQ 72(R14), BP
VMOVDQU64 (BP)(R15*1), Z29
// Load and process 64 bytes from input 0 to 4 outputs
// Pattern for every input: Z31 = affine(matrix entry, Z30), then XOR Z31
// into the accumulator of the matching output.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 4 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 4 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 4 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 4 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 4 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 4 outputs
// From the third coefficient of this input onwards the matrix entries are no
// longer in registers; they are broadcast straight from memory (.BCST).
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 4 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 4 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 4 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 4 outputs
MOVQ (R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the iteration down.
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x4_64Xor_loop
// Clear the upper vector state before returning; the early-exit path jumps
// past this because it never touched a vector register.
VZEROUPPER
mulGFNI_10x4_64Xor_end:
RET
// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit (YMM) variant of the 10-input / 4-output XOR kernel. For every
// 32-byte block: out[j] ^= affine(matrix[i*4+j], in[i]) for i = 0..9,
// j = 0..3, with the previous output contents loaded as the initial
// accumulator values.
//
// Only n>>5 full 32-byte blocks are processed; if that is zero the routine
// returns without touching memory. Any remainder of n below a multiple of 32
// is not handled here.
//
// Register roles:
//   AX                      remaining 32-byte iterations
//   CX                      base of matrix (one uint64 per 8 bytes)
//   BX,SI,DI,R8-R13,DX      read pointers for inputs 0..9 (start already added)
//   R14                     base of the out slice-header array
//   R15                     byte offset into every output (start, +32 per pass)
//   BP                      scratch: data pointer of the output being accessed
//   Y0-Y9                   matrix entries 0..9, broadcast to all lanes
//   Y10-Y13                 accumulators for outputs 0..3
//   Y14                     current input block
//   Y15                     on-demand matrix entry, then product scratch
TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88
// Loading 10 of 40 tables to registers
// Destination kept on stack
// Full registers estimated 46 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of loop iterations; bail out when there are none.
// (SHRQ already sets ZF, so the TESTQ is redundant but harmless.)
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x4Xor_end
// Preload the first 10 matrix entries; entries 10..39 are re-broadcast into
// Y15 inside the loop each time they are needed.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
VBROADCASTSD 72(CX), Y9
// Fetch the data pointer of each of the 10 input slices. Slice headers are
// 24 bytes apart; DX is overwritten last because it holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = base of the out slice headers. The same load is emitted twice; the
// second one is redundant but harmless.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x4Xor_loop:
// Load 4 outputs
// (each output's data pointer is re-read from its slice header via R14 and
// indexed by the running offset R15)
MOVQ (R14), BP
VMOVDQU (BP)(R15*1), Y10
MOVQ 24(R14), BP
VMOVDQU (BP)(R15*1), Y11
MOVQ 48(R14), BP
VMOVDQU (BP)(R15*1), Y12
MOVQ 72(R14), BP
VMOVDQU (BP)(R15*1), Y13
// Load and process 32 bytes from input 0 to 4 outputs
// Pattern for every input: Y15 = affine(matrix entry, Y14), then XOR Y15
// into the accumulator of the matching output.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 4 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 4 outputs
// From the third coefficient of this input onwards the matrix entry is
// broadcast into Y15 first and Y15 is then overwritten with the product.
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 4 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 4 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 4 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 4 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 4 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 4 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 4 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 4 outputs
MOVQ (R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the iteration down.
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x4Xor_loop
// Clear the upper vector state before returning; the early-exit path jumps
// past this because it never touched a vector register.
VZEROUPPER
mulAvxGFNI_10x4Xor_end:
RET
// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte block, computes each of the 5 outputs as the XOR of the
// GF(2^8) affine products of the 10 inputs:
// out[j] = XOR over i of affine(matrix[i*5+j], in[i]), i = 0..9, j = 0..4.
// The outputs are overwritten: input 0 initialises the accumulators directly
// and the previous output contents are never read.
//
// Only n>>6 full 64-byte blocks are processed; if that is zero the routine
// returns without touching memory. Any remainder of n below a multiple of 64
// is not handled here.
//
// Register roles:
//   AX                      remaining 64-byte iterations
//   CX                      base of matrix (one uint64 per 8 bytes)
//   BX,SI,DI,R8-R13,DX      read pointers for inputs 0..9 (start already added)
//   R14                     base of the out slice-header array
//   R15                     byte offset into every output (start, +64 per pass)
//   BP                      scratch: data pointer of the output being stored
//   Z0-Z24                  matrix entries 0..24, broadcast to all lanes
//   Z25-Z29                 accumulators for outputs 0..4
//   Z30 / Z31               current input block / product scratch
TEXT ·mulGFNI_10x5_64(SB), $8-88
// Loading 25 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 57 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of loop iterations; bail out when there are none.
// (SHRQ already sets ZF, so the TESTQ is redundant but harmless.)
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x5_64_end
// Preload the first 25 matrix entries (inputs 0..4); entries 25..49 are read
// from memory with an embedded broadcast inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// Fetch the data pointer of each of the 10 input slices. Slice headers are
// 24 bytes apart; DX is overwritten last because it holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = base of the out slice headers. The same load is emitted twice; the
// second one is redundant but harmless.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x5_64_loop:
// Load and process 64 bytes from input 0 to 5 outputs
// The products of input 0 are written straight into the accumulators, which
// is what makes this the overwriting (non-Xor) variant.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z29
// Load and process 64 bytes from input 1 to 5 outputs
// Pattern for inputs 1..9: Z31 = affine(matrix entry, Z30), then XOR Z31
// into the accumulator of the matching output.
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// From here on the matrix entries are not in registers; they are broadcast
// straight from memory (.BCST).
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 5 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 5 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 5 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 5 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
// (each output's data pointer is re-read from its slice header via R14 and
// indexed by the running offset R15)
MOVQ (R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the iteration down.
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x5_64_loop
// Clear the upper vector state before returning; the early-exit path jumps
// past this because it never touched a vector register.
VZEROUPPER
mulGFNI_10x5_64_end:
RET
// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 256-bit (YMM) variant of the 10-input / 5-output overwrite kernel. For
// every 32-byte block:
// out[j] = XOR over i of affine(matrix[i*5+j], in[i]), i = 0..9, j = 0..4.
// Input 0 initialises the accumulators directly, so the previous output
// contents are never read.
//
// Only n>>5 full 32-byte blocks are processed; if that is zero the routine
// returns without touching memory. Any remainder of n below a multiple of 32
// is not handled here.
//
// Register roles:
//   AX                      remaining 32-byte iterations
//   CX                      base of matrix (one uint64 per 8 bytes)
//   BX,SI,DI,R8-R13,DX      read pointers for inputs 0..9 (start already added)
//   R14                     base of the out slice-header array
//   R15                     byte offset into every output (start, +32 per pass)
//   BP                      scratch: data pointer of the output being stored
//   Y0-Y8                   matrix entries 0..8, broadcast to all lanes
//   Y9-Y13                  accumulators for outputs 0..4
//   Y14                     current input block
//   Y15                     on-demand matrix entry, then product scratch
TEXT ·mulAvxGFNI_10x5(SB), $8-88
// Loading 9 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 57 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 32 = number of loop iterations; bail out when there are none.
// (SHRQ already sets ZF, so the TESTQ is redundant but harmless.)
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x5_end
// Preload the first 9 matrix entries; entries 9..49 are re-broadcast into
// Y15 inside the loop each time they are needed.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Fetch the data pointer of each of the 10 input slices. Slice headers are
// 24 bytes apart; DX is overwritten last because it holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = base of the out slice headers. The same load is emitted twice; the
// second one is redundant but harmless.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x5_loop:
// Load and process 32 bytes from input 0 to 5 outputs
// The products of input 0 are written straight into the accumulators, which
// is what makes this the overwriting (non-Xor) variant.
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y13
// Load and process 32 bytes from input 1 to 5 outputs
// Pattern for inputs 1..9: Y15 = affine(matrix entry, Y14), then XOR Y15
// into the accumulator of the matching output. The last coefficient of this
// input is the first one that has to be broadcast into Y15 on demand.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 5 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 5 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 5 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
// (each output's data pointer is re-read from its slice header via R14 and
// indexed by the running offset R15)
MOVQ (R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the iteration down.
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x5_loop
// Clear the upper vector state before returning; the early-exit path jumps
// past this because it never touched a vector register.
VZEROUPPER
mulAvxGFNI_10x5_end:
RET
// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// For every 64-byte block, XORs into each of the 5 outputs the GF(2^8) affine
// products of the 10 inputs: out[j] ^= affine(matrix[i*5+j], in[i]) for
// i = 0..9, j = 0..4. The previous contents of the outputs are loaded first
// and used as the starting accumulator values ("Xor" variant).
//
// Only n>>6 full 64-byte blocks are processed; if that is zero the routine
// returns without touching memory. Any remainder of n below a multiple of 64
// is not handled here.
//
// Register roles:
//   AX                      remaining 64-byte iterations
//   CX                      base of matrix (one uint64 per 8 bytes)
//   BX,SI,DI,R8-R13,DX      read pointers for inputs 0..9 (start already added)
//   R14                     base of the out slice-header array
//   R15                     byte offset into every output (start, +64 per pass)
//   BP                      scratch: data pointer of the output being accessed
//   Z0-Z24                  matrix entries 0..24, broadcast to all lanes
//   Z25-Z29                 accumulators for outputs 0..4
//   Z30 / Z31               current input block / product scratch
TEXT ·mulGFNI_10x5_64Xor(SB), $8-88
// Loading 25 of 50 tables to registers
// Destination kept on stack
// Full registers estimated 57 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = n / 64 = number of loop iterations; bail out when there are none.
// (SHRQ already sets ZF, so the TESTQ is redundant but harmless.)
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x5_64Xor_end
// Preload the first 25 matrix entries (inputs 0..4); entries 25..49 are read
// from memory with an embedded broadcast inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
VBROADCASTF32X2 192(CX), Z24
// Fetch the data pointer of each of the 10 input slices. Slice headers are
// 24 bytes apart; DX is overwritten last because it holds the header base.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = base of the out slice headers. The same load is emitted twice; the
// second one is redundant but harmless.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x5_64Xor_loop:
// Load 5 outputs
// (each output's data pointer is re-read from its slice header via R14 and
// indexed by the running offset R15)
MOVQ (R14), BP
VMOVDQU64 (BP)(R15*1), Z25
MOVQ 24(R14), BP
VMOVDQU64 (BP)(R15*1), Z26
MOVQ 48(R14), BP
VMOVDQU64 (BP)(R15*1), Z27
MOVQ 72(R14), BP
VMOVDQU64 (BP)(R15*1), Z28
MOVQ 96(R14), BP
VMOVDQU64 (BP)(R15*1), Z29
// Load and process 64 bytes from input 0 to 5 outputs
// Pattern for every input: Z31 = affine(matrix entry, Z30), then XOR Z31
// into the accumulator of the matching output.
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 5 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 5 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 5 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 5 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 5 outputs
// From here on the matrix entries are not in registers; they are broadcast
// straight from memory (.BCST).
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 5 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 5 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 5 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 5 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 5 outputs
MOVQ (R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
// Advance the shared output offset by one block and count the iteration down.
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x5_64Xor_loop
// Clear the upper vector state before returning; the early-exit path jumps
// past this because it never touched a vector register.
VZEROUPPER
mulGFNI_10x5_64Xor_end:
RET
// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 10 inputs -> 5 outputs, 32 bytes per loop iteration, accumulating variant.
// Runs n>>5 iterations; any remainder of n below 32 bytes is not touched here.
// Per iteration each of the 5 outputs is loaded, XORed with
// VGF2P8AFFINEQB(in[i] block, matrix[i*5+j]) for every input i, and stored back.
// Inputs are read starting at byte offset start; outputs are read and written
// at the same offset.
//
// Register roles:
//   AX                      remaining iteration count (n >> 5)
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R13, DX  read pointers for in[0]..in[9] (advanced by 32 each iteration)
//   R14                     base of the out slice headers
//   R15                     byte offset applied to every output (initially start)
//   BP                      scratch: data pointer of the output being loaded/stored
//   Y0-Y8                   matrix[0..8], each broadcast to all four qword lanes
//   Y9-Y13                  accumulators for out[0]..out[4]
//   Y14                     current 32-byte input block
//   Y15                     scratch: broadcast matrix entry, then transform result
//
// NOTE(review): the $8 frame presumably exists so the assembler saves/restores
// BP, which is clobbered below - confirm against the Go assembler rules.
TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88
// Loading 9 of 50 tables to registers
// Destination kept on stack
// (output data pointers are re-read through R14 for every load/store instead
// of being held in registers)
// Full registers estimated 57 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; return without touching anything if zero.
// SHRQ with a non-zero count already sets ZF, so the TESTQ is redundant but harmless.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x5Xor_end
// Preload the first 9 matrix entries; the remaining 41 are broadcast from
// memory into Y15 right before each use inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
VBROADCASTSD 64(CX), Y8
// Slice headers are 24 bytes apart, so 24*i(DX) is the data pointer of in[i].
// DX itself is overwritten last with in[9]'s data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// NOTE(review): out_base is loaded into R14 twice; the second load is redundant.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x5Xor_loop:
// Load 5 outputs
// (existing output bytes seed the accumulators, which makes this the Xor variant)
MOVQ (R14), BP
VMOVDQU (BP)(R15*1), Y9
MOVQ 24(R14), BP
VMOVDQU (BP)(R15*1), Y10
MOVQ 48(R14), BP
VMOVDQU (BP)(R15*1), Y11
MOVQ 72(R14), BP
VMOVDQU (BP)(R15*1), Y12
MOVQ 96(R14), BP
VMOVDQU (BP)(R15*1), Y13
// Load and process 32 bytes from input 0 to 5 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 5 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
VXORPD Y12, Y15, Y12
// From here on the matrix entry is not resident: broadcast it into Y15, which
// the affine instruction then overwrites with its result.
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 5 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 5 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 5 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 5 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 5 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 5 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 5 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 5 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 5 outputs
MOVQ (R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
// (advance the shared output offset by one 32-byte block)
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x5Xor_loop
// Only reached after the loop ran; the early-exit path jumps past this
// because no vector register has been written yet.
VZEROUPPER
mulAvxGFNI_10x5Xor_end:
RET
// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 10 inputs -> 6 outputs, 64 bytes per loop iteration, overwriting variant.
// Runs n>>6 iterations; any remainder of n below 64 bytes is not touched here.
// Per iteration out[j] block = XOR over i of
// VGF2P8AFFINEQB(in[i] block, matrix[i*6+j]); previous output contents are
// not read. Inputs are read starting at byte offset start; outputs are
// written at the same offset.
//
// Register roles:
//   AX                      remaining iteration count (n >> 6)
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R13, DX  read pointers for in[0]..in[9] (advanced by 64 each iteration)
//   R14                     base of the out slice headers
//   R15                     byte offset applied to every output (initially start)
//   BP                      scratch: data pointer of the output being stored
//   Z0-Z23                  matrix[0..23] (inputs 0-3), each broadcast to all qword lanes
//   Z24-Z29                 accumulators for out[0]..out[5]
//   Z30                     current 64-byte input block
//   Z31                     scratch: transform result
//
// NOTE(review): the $8 frame presumably exists so the assembler saves/restores
// BP, which is clobbered below - confirm against the Go assembler rules.
TEXT ·mulGFNI_10x6_64(SB), $8-88
// Loading 24 of 60 tables to registers
// Destination kept on stack
// (output data pointers are re-read through R14 for every store instead of
// being held in registers)
// Full registers estimated 68 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; return without touching anything if zero.
// SHRQ with a non-zero count already sets ZF, so the TESTQ is redundant but harmless.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x6_64_end
// Preload the matrix entries for inputs 0-3; entries for inputs 4-9 are used
// straight from memory via the .BCST form inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Slice headers are 24 bytes apart, so 24*i(DX) is the data pointer of in[i].
// DX itself is overwritten last with in[9]'s data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// NOTE(review): out_base is loaded into R14 twice; the second load is redundant.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x6_64_loop:
// Load and process 64 bytes from input 0 to 6 outputs
// (input 0 writes the accumulators directly, so no zeroing or output load is needed)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// (matrix entries 24 and up are not resident; .BCST broadcasts the qword from memory)
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 6 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 6 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 6 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
MOVQ (R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
// (advance the shared output offset by one 64-byte block)
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x6_64_loop
// Only reached after the loop ran; the early-exit path jumps past this
// because no vector register has been written yet.
VZEROUPPER
mulGFNI_10x6_64_end:
RET
// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 10 inputs -> 6 outputs, 32 bytes per loop iteration, overwriting variant.
// Runs n>>5 iterations; any remainder of n below 32 bytes is not touched here.
// Per iteration out[j] block = XOR over i of
// VGF2P8AFFINEQB(in[i] block, matrix[i*6+j]); previous output contents are
// not read. Inputs are read starting at byte offset start; outputs are
// written at the same offset.
//
// Register roles:
//   AX                      remaining iteration count (n >> 5)
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R13, DX  read pointers for in[0]..in[9] (advanced by 32 each iteration)
//   R14                     base of the out slice headers
//   R15                     byte offset applied to every output (initially start)
//   BP                      scratch: data pointer of the output being stored
//   Y0-Y7                   matrix[0..7], each broadcast to all four qword lanes
//   Y8-Y13                  accumulators for out[0]..out[5]
//   Y14                     current 32-byte input block
//   Y15                     scratch: broadcast matrix entry, then transform result
//
// NOTE(review): the $8 frame presumably exists so the assembler saves/restores
// BP, which is clobbered below - confirm against the Go assembler rules.
TEXT ·mulAvxGFNI_10x6(SB), $8-88
// Loading 8 of 60 tables to registers
// Destination kept on stack
// (output data pointers are re-read through R14 for every store instead of
// being held in registers)
// Full registers estimated 68 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; return without touching anything if zero.
// SHRQ with a non-zero count already sets ZF, so the TESTQ is redundant but harmless.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x6_end
// Preload the first 8 matrix entries; the remaining 52 are broadcast from
// memory into Y15 right before each use inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Slice headers are 24 bytes apart, so 24*i(DX) is the data pointer of in[i].
// DX itself is overwritten last with in[9]'s data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// NOTE(review): out_base is loaded into R14 twice; the second load is redundant.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x6_loop:
// Load and process 32 bytes from input 0 to 6 outputs
// (input 0 writes the accumulators directly, so no zeroing or output load is needed)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// From here on the matrix entry is not resident: broadcast it into Y15, which
// the affine instruction then overwrites with its result.
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 6 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 6 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 6 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
MOVQ (R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
// (advance the shared output offset by one 32-byte block)
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x6_loop
// Only reached after the loop ran; the early-exit path jumps past this
// because no vector register has been written yet.
VZEROUPPER
mulAvxGFNI_10x6_end:
RET
// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// 10 inputs -> 6 outputs, 64 bytes per loop iteration, accumulating variant.
// Runs n>>6 iterations; any remainder of n below 64 bytes is not touched here.
// Per iteration each of the 6 outputs is loaded, XORed with
// VGF2P8AFFINEQB(in[i] block, matrix[i*6+j]) for every input i, and stored back.
// Inputs are read starting at byte offset start; outputs are read and written
// at the same offset.
//
// Register roles:
//   AX                      remaining iteration count (n >> 6)
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R13, DX  read pointers for in[0]..in[9] (advanced by 64 each iteration)
//   R14                     base of the out slice headers
//   R15                     byte offset applied to every output (initially start)
//   BP                      scratch: data pointer of the output being loaded/stored
//   Z0-Z23                  matrix[0..23] (inputs 0-3), each broadcast to all qword lanes
//   Z24-Z29                 accumulators for out[0]..out[5]
//   Z30                     current 64-byte input block
//   Z31                     scratch: transform result
//
// NOTE(review): the $8 frame presumably exists so the assembler saves/restores
// BP, which is clobbered below - confirm against the Go assembler rules.
TEXT ·mulGFNI_10x6_64Xor(SB), $8-88
// Loading 24 of 60 tables to registers
// Destination kept on stack
// (output data pointers are re-read through R14 for every load/store instead
// of being held in registers)
// Full registers estimated 68 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks; return without touching anything if zero.
// SHRQ with a non-zero count already sets ZF, so the TESTQ is redundant but harmless.
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x6_64Xor_end
// Preload the matrix entries for inputs 0-3; entries for inputs 4-9 are used
// straight from memory via the .BCST form inside the loop.
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
VBROADCASTF32X2 184(CX), Z23
// Slice headers are 24 bytes apart, so 24*i(DX) is the data pointer of in[i].
// DX itself is overwritten last with in[9]'s data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// NOTE(review): out_base is loaded into R14 twice; the second load is redundant.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x6_64Xor_loop:
// Load 6 outputs
// (existing output bytes seed the accumulators, which makes this the Xor variant)
MOVQ (R14), BP
VMOVDQU64 (BP)(R15*1), Z24
MOVQ 24(R14), BP
VMOVDQU64 (BP)(R15*1), Z25
MOVQ 48(R14), BP
VMOVDQU64 (BP)(R15*1), Z26
MOVQ 72(R14), BP
VMOVDQU64 (BP)(R15*1), Z27
MOVQ 96(R14), BP
VMOVDQU64 (BP)(R15*1), Z28
MOVQ 120(R14), BP
VMOVDQU64 (BP)(R15*1), Z29
// Load and process 64 bytes from input 0 to 6 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 6 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 6 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 6 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 6 outputs
// (matrix entries 24 and up are not resident; .BCST broadcasts the qword from memory)
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 6 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 6 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 6 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 6 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 6 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 6 outputs
MOVQ (R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
// (advance the shared output offset by one 64-byte block)
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x6_64Xor_loop
// Only reached after the loop ran; the early-exit path jumps past this
// because no vector register has been written yet.
VZEROUPPER
mulGFNI_10x6_64Xor_end:
RET
// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// 10 inputs -> 6 outputs, 32 bytes per loop iteration, accumulating variant.
// Runs n>>5 iterations; any remainder of n below 32 bytes is not touched here.
// Per iteration each of the 6 outputs is loaded, XORed with
// VGF2P8AFFINEQB(in[i] block, matrix[i*6+j]) for every input i, and stored back.
// Inputs are read starting at byte offset start; outputs are read and written
// at the same offset.
//
// Register roles:
//   AX                      remaining iteration count (n >> 5)
//   CX                      matrix base pointer
//   BX, SI, DI, R8-R13, DX  read pointers for in[0]..in[9] (advanced by 32 each iteration)
//   R14                     base of the out slice headers
//   R15                     byte offset applied to every output (initially start)
//   BP                      scratch: data pointer of the output being loaded/stored
//   Y0-Y7                   matrix[0..7], each broadcast to all four qword lanes
//   Y8-Y13                  accumulators for out[0]..out[5]
//   Y14                     current 32-byte input block
//   Y15                     scratch: broadcast matrix entry, then transform result
//
// NOTE(review): the $8 frame presumably exists so the assembler saves/restores
// BP, which is clobbered below - confirm against the Go assembler rules.
TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88
// Loading 8 of 60 tables to registers
// Destination kept on stack
// (output data pointers are re-read through R14 for every load/store instead
// of being held in registers)
// Full registers estimated 68 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks; return without touching anything if zero.
// SHRQ with a non-zero count already sets ZF, so the TESTQ is redundant but harmless.
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x6Xor_end
// Preload the first 8 matrix entries; the remaining 52 are broadcast from
// memory into Y15 right before each use inside the loop.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
VBROADCASTSD 56(CX), Y7
// Slice headers are 24 bytes apart, so 24*i(DX) is the data pointer of in[i].
// DX itself is overwritten last with in[9]'s data pointer.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// NOTE(review): out_base is loaded into R14 twice; the second load is redundant.
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x6Xor_loop:
// Load 6 outputs
// (existing output bytes seed the accumulators, which makes this the Xor variant)
MOVQ (R14), BP
VMOVDQU (BP)(R15*1), Y8
MOVQ 24(R14), BP
VMOVDQU (BP)(R15*1), Y9
MOVQ 48(R14), BP
VMOVDQU (BP)(R15*1), Y10
MOVQ 72(R14), BP
VMOVDQU (BP)(R15*1), Y11
MOVQ 96(R14), BP
VMOVDQU (BP)(R15*1), Y12
MOVQ 120(R14), BP
VMOVDQU (BP)(R15*1), Y13
// Load and process 32 bytes from input 0 to 6 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 6 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
VXORPD Y9, Y15, Y9
// From here on the matrix entry is not resident: broadcast it into Y15, which
// the affine instruction then overwrites with its result.
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 6 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 6 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 6 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 6 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 6 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 6 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 6 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 6 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 6 outputs
MOVQ (R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
// (advance the shared output offset by one 32-byte block)
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x6Xor_loop
// Only reached after the loop ran; the early-exit path jumps past this
// because no vector register has been written yet.
VZEROUPPER
mulAvxGFNI_10x6Xor_end:
RET
// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Produces 7 output streams from 10 input streams, 64 bytes per stream per
// loop iteration. For input i and output j the 8-byte matrix entry at byte
// offset 8*(i*7+j) is broadcast to every qword lane and applied to the input
// bytes with VGF2P8AFFINEQB (constant term $0x00). The 10 partial results of
// each output are XORed together and stored, overwriting the output bytes.
//
// Arguments:
//   matrix: 70 uint64 entries are read (byte offsets 0..552).
//   in:     base pointers of 10 slices are read (slice headers 24 bytes apart).
//   out:    base pointers of 7 slices, re-read from the headers on every iteration.
//   start:  byte offset added to every input pointer and used to index every output.
//   n:      byte count; n>>6 iterations are run, a trailing n%64 bytes is not
//           processed and the function returns immediately when n>>6 is zero.
//
// Register roles:
//   AX = remaining iterations, CX = matrix base,
//   BX, SI, DI, R8-R13, DX = cursors into inputs 0..9,
//   R14 = address of the out slice headers, R15 = current output byte offset,
//   BP = scratch for the current output base pointer,
//   Z0-Z22 = preloaded matrix entries 0..22, Z23-Z29 = accumulators for outputs 0..6,
//   Z30 = current input block, Z31 = product scratch.
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero $8 frame
// exists so that the Go assembler saves/restores BP around the body - confirm.
TEXT ·mulGFNI_10x7_64(SB), $8-88
// Loading 23 of 70 tables to registers
// Destination kept on stack
// Full registers estimated 79 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so no separate TESTQ is needed before the branch.
SHRQ $0x06, AX
JZ mulGFNI_10x7_64_end
// Broadcast the first 23 matrix entries (all of inputs 0-2, first two of input 3).
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Load the base pointer of each of the 10 input slices; DX is overwritten last
// because it holds the address of the slice headers until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = address of the out slice headers, R15 = starting byte offset.
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x7_64_loop:
// Load and process 64 bytes from input 0 to 7 outputs
// Input 0 initialises the accumulators directly (no XOR with prior contents).
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
// From the third entry of this input onwards the matrix entries are no longer
// held in registers and are broadcast straight from memory (.BCST).
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 7 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 7 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 7 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
// Each output base pointer is fetched from its slice header via R14 and
// indexed by the running byte offset in R15.
MOVQ (R14), BP
VMOVDQU64 Z23, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x7_64_loop
VZEROUPPER
mulGFNI_10x7_64_end:
RET
// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Produces 7 output streams from 10 input streams, 32 bytes per stream per
// loop iteration. For input i and output j the 8-byte matrix entry at byte
// offset 8*(i*7+j) is broadcast to every qword lane and applied to the input
// bytes with VGF2P8AFFINEQB (constant term $0x00). The 10 partial results of
// each output are XORed together and stored, overwriting the output bytes.
//
// Arguments:
//   matrix: 70 uint64 entries are read (byte offsets 0..552).
//   in:     base pointers of 10 slices are read (slice headers 24 bytes apart).
//   out:    base pointers of 7 slices, re-read from the headers on every iteration.
//   start:  byte offset added to every input pointer and used to index every output.
//   n:      byte count; n>>5 iterations are run, a trailing n%32 bytes is not
//           processed and the function returns immediately when n>>5 is zero.
//
// Register roles:
//   AX = remaining iterations, CX = matrix base,
//   BX, SI, DI, R8-R13, DX = cursors into inputs 0..9,
//   R14 = address of the out slice headers, R15 = current output byte offset,
//   BP = scratch for the current output base pointer,
//   Y0-Y6 = preloaded matrix entries 0..6 (input 0), Y7-Y13 = accumulators for
//   outputs 0..6, Y14 = current input block,
//   Y15 = broadcast matrix entry, then reused for the product.
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero $8 frame
// exists so that the Go assembler saves/restores BP around the body - confirm.
TEXT ·mulAvxGFNI_10x7(SB), $8-88
// Loading 7 of 70 tables to registers
// Destination kept on stack
// Full registers estimated 79 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so no separate TESTQ is needed before the branch.
SHRQ $0x05, AX
JZ mulAvxGFNI_10x7_end
// Broadcast the 7 matrix entries belonging to input 0.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Load the base pointer of each of the 10 input slices; DX is overwritten last
// because it holds the address of the slice headers until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = address of the out slice headers, R15 = starting byte offset.
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x7_loop:
// Load and process 32 bytes from input 0 to 7 outputs
// Input 0 initialises the accumulators directly (no XOR with prior contents).
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// From here on every matrix entry is broadcast from memory into Y15, which is
// then overwritten with the product and XORed into the accumulator.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 7 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 7 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 7 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
// Each output base pointer is fetched from its slice header via R14 and
// indexed by the running byte offset in R15.
MOVQ (R14), BP
VMOVDQU Y7, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x7_loop
VZEROUPPER
mulAvxGFNI_10x7_end:
RET
// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: on every iteration the current 64 bytes of each of the
// 7 outputs are loaded first, the contributions of all 10 inputs are XORed
// into them, and the result is stored back. For input i and output j the
// 8-byte matrix entry at byte offset 8*(i*7+j) is broadcast to every qword
// lane and applied to the input bytes with VGF2P8AFFINEQB (constant term $0x00).
//
// Arguments:
//   matrix: 70 uint64 entries are read (byte offsets 0..552).
//   in:     base pointers of 10 slices are read (slice headers 24 bytes apart).
//   out:    base pointers of 7 slices, re-read from the headers on every
//           iteration; the outputs are both read and written.
//   start:  byte offset added to every input pointer and used to index every output.
//   n:      byte count; n>>6 iterations are run, a trailing n%64 bytes is not
//           processed and the function returns immediately when n>>6 is zero.
//
// Register roles:
//   AX = remaining iterations, CX = matrix base,
//   BX, SI, DI, R8-R13, DX = cursors into inputs 0..9,
//   R14 = address of the out slice headers, R15 = current output byte offset,
//   BP = scratch for the current output base pointer,
//   Z0-Z22 = preloaded matrix entries 0..22, Z23-Z29 = accumulators for outputs 0..6,
//   Z30 = current input block, Z31 = product scratch.
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero $8 frame
// exists so that the Go assembler saves/restores BP around the body - confirm.
TEXT ·mulGFNI_10x7_64Xor(SB), $8-88
// Loading 23 of 70 tables to registers
// Destination kept on stack
// Full registers estimated 79 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 64-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so no separate TESTQ is needed before the branch.
SHRQ $0x06, AX
JZ mulGFNI_10x7_64Xor_end
// Broadcast the first 23 matrix entries (all of inputs 0-2, first two of input 3).
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
VBROADCASTF32X2 176(CX), Z22
// Load the base pointer of each of the 10 input slices; DX is overwritten last
// because it holds the address of the slice headers until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = address of the out slice headers, R15 = starting byte offset.
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x7_64Xor_loop:
// Load 7 outputs
// The existing output bytes seed the accumulators.
MOVQ (R14), BP
VMOVDQU64 (BP)(R15*1), Z23
MOVQ 24(R14), BP
VMOVDQU64 (BP)(R15*1), Z24
MOVQ 48(R14), BP
VMOVDQU64 (BP)(R15*1), Z25
MOVQ 72(R14), BP
VMOVDQU64 (BP)(R15*1), Z26
MOVQ 96(R14), BP
VMOVDQU64 (BP)(R15*1), Z27
MOVQ 120(R14), BP
VMOVDQU64 (BP)(R15*1), Z28
MOVQ 144(R14), BP
VMOVDQU64 (BP)(R15*1), Z29
// Load and process 64 bytes from input 0 to 7 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 7 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 7 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 7 outputs
// From the third entry of this input onwards the matrix entries are no longer
// held in registers and are broadcast straight from memory (.BCST).
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 7 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 7 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 7 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 7 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 7 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 7 outputs
// Each output base pointer is fetched from its slice header via R14 and
// indexed by the running byte offset in R15.
MOVQ (R14), BP
VMOVDQU64 Z23, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x7_64Xor_loop
VZEROUPPER
mulGFNI_10x7_64Xor_end:
RET
// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: on every iteration the current 32 bytes of each of the
// 7 outputs are loaded first, the contributions of all 10 inputs are XORed
// into them, and the result is stored back. For input i and output j the
// 8-byte matrix entry at byte offset 8*(i*7+j) is broadcast to every qword
// lane and applied to the input bytes with VGF2P8AFFINEQB (constant term $0x00).
//
// Arguments:
//   matrix: 70 uint64 entries are read (byte offsets 0..552).
//   in:     base pointers of 10 slices are read (slice headers 24 bytes apart).
//   out:    base pointers of 7 slices, re-read from the headers on every
//           iteration; the outputs are both read and written.
//   start:  byte offset added to every input pointer and used to index every output.
//   n:      byte count; n>>5 iterations are run, a trailing n%32 bytes is not
//           processed and the function returns immediately when n>>5 is zero.
//
// Register roles:
//   AX = remaining iterations, CX = matrix base,
//   BX, SI, DI, R8-R13, DX = cursors into inputs 0..9,
//   R14 = address of the out slice headers, R15 = current output byte offset,
//   BP = scratch for the current output base pointer,
//   Y0-Y6 = preloaded matrix entries 0..6 (input 0), Y7-Y13 = accumulators for
//   outputs 0..6, Y14 = current input block,
//   Y15 = broadcast matrix entry, then reused for the product.
//
// NOTE(review): BP is clobbered as scratch; presumably the non-zero $8 frame
// exists so that the Go assembler saves/restores BP around the body - confirm.
TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88
// Loading 7 of 70 tables to registers
// Destination kept on stack
// Full registers estimated 79 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// AX = number of 32-byte blocks. SHRQ with a non-zero count sets ZF from its
// result, so no separate TESTQ is needed before the branch.
SHRQ $0x05, AX
JZ mulAvxGFNI_10x7Xor_end
// Broadcast the 7 matrix entries belonging to input 0.
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
VBROADCASTSD 48(CX), Y6
// Load the base pointer of each of the 10 input slices; DX is overwritten last
// because it holds the address of the slice headers until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
// R14 = address of the out slice headers, R15 = starting byte offset.
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x7Xor_loop:
// Load 7 outputs
// The existing output bytes seed the accumulators.
MOVQ (R14), BP
VMOVDQU (BP)(R15*1), Y7
MOVQ 24(R14), BP
VMOVDQU (BP)(R15*1), Y8
MOVQ 48(R14), BP
VMOVDQU (BP)(R15*1), Y9
MOVQ 72(R14), BP
VMOVDQU (BP)(R15*1), Y10
MOVQ 96(R14), BP
VMOVDQU (BP)(R15*1), Y11
MOVQ 120(R14), BP
VMOVDQU (BP)(R15*1), Y12
MOVQ 144(R14), BP
VMOVDQU (BP)(R15*1), Y13
// Load and process 32 bytes from input 0 to 7 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y11, Y15, Y11
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y12, Y15, Y12
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 7 outputs
// From here on every matrix entry is broadcast from memory into Y15, which is
// then overwritten with the product and XORed into the accumulator.
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 7 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 7 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 7 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 7 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 7 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 7 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 7 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 7 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 7 outputs
// Each output base pointer is fetched from its slice header via R14 and
// indexed by the running byte offset in R15.
MOVQ (R14), BP
VMOVDQU Y7, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x7Xor_loop
VZEROUPPER
mulAvxGFNI_10x7Xor_end:
RET
// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Combines 10 input slices into 8 output slices, 64 bytes per iteration.
// For each output j the stored block is the XOR over all inputs i of
// VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]). Previous output contents
// are overwritten. The loop runs n>>6 times, so a tail of n&63 bytes is
// left untouched; nothing is done when n < 64.
//
// Register roles:
//   AX                      remaining 64-byte iterations
//   CX                      matrix base (80 entries, 8 bytes each)
//   BX, SI, DI, R8-R13, DX  data pointers of in[0]..in[9], advanced by start
//   R14                     base of the out slice headers (24 bytes apart)
//   R15                     byte offset added to every output pointer (begins at start)
//   BP                      scratch: data pointer of the output being stored
//   Z0-Z21                  matrix entries 0..21 broadcast across the register
//   Z22-Z29                 accumulators for outputs 0..7
//   Z30 / Z31               current input block / per-product temporary
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// saves and restores BP, which is used as scratch here - confirm.
TEXT ·mulGFNI_10x8_64(SB), $8-88
// Loading 22 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 90 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x8_64_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Fetch the data pointer of each input slice (DX is reused last)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x8_64_loop:
// Load and process 64 bytes from input 0 to 8 outputs
// (first input initialises the accumulators, no XOR needed)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z29
// Load and process 64 bytes from input 1 to 8 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
// (matrix entries from index 22 on are broadcast straight from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 8 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 8 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 8 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 8 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 8 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 8 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 8 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
// (each output pointer is re-read from its slice header every iteration)
MOVQ (R14), BP
VMOVDQU64 Z22, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z23, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x8_64_loop
VZEROUPPER
mulGFNI_10x8_64_end:
RET
// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Combines 10 input slices into 8 output slices, 32 bytes per iteration.
// For each output j the stored block is the XOR over all inputs i of
// VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]). Previous output contents
// are overwritten. The loop runs n>>5 times, so a tail of n&31 bytes is
// left untouched; nothing is done when n < 32.
//
// Register roles:
//   AX                      remaining 32-byte iterations
//   CX                      matrix base (80 entries, 8 bytes each)
//   BX, SI, DI, R8-R13, DX  data pointers of in[0]..in[9], advanced by start
//   R14                     base of the out slice headers (24 bytes apart)
//   R15                     byte offset added to every output pointer (begins at start)
//   BP                      scratch: data pointer of the output being stored
//   Y0-Y5                   matrix entries 0..5 broadcast across the register
//   Y6-Y13                  accumulators for outputs 0..7
//   Y14 / Y15               current input block / matrix entry and product temporary
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// saves and restores BP, which is used as scratch here - confirm.
TEXT ·mulAvxGFNI_10x8(SB), $8-88
// Loading 6 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 90 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x8_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Fetch the data pointer of each input slice (DX is reused last)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x8_loop:
// Load and process 32 bytes from input 0 to 8 outputs
// (first input initialises the accumulators, no XOR needed)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
VBROADCASTSD 48(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 56(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 8 outputs
// (from here on every matrix entry is broadcast from memory into Y15)
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 8 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 8 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 8 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 8 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
// (each output pointer is re-read from its slice header every iteration)
MOVQ (R14), BP
VMOVDQU Y6, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y7, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x8_loop
VZEROUPPER
mulAvxGFNI_10x8_end:
RET
// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Accumulating variant: 10 input slices are folded into 8 output slices,
// 64 bytes per iteration. Each output block is first loaded from memory,
// then XORed with VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]) for every
// input i, and written back. The loop runs n>>6 times, so a tail of n&63
// bytes is left untouched; nothing is done when n < 64.
//
// Register roles:
//   AX                      remaining 64-byte iterations
//   CX                      matrix base (80 entries, 8 bytes each)
//   BX, SI, DI, R8-R13, DX  data pointers of in[0]..in[9], advanced by start
//   R14                     base of the out slice headers (24 bytes apart)
//   R15                     byte offset added to every output pointer (begins at start)
//   BP                      scratch: data pointer of the output being loaded/stored
//   Z0-Z21                  matrix entries 0..21 broadcast across the register
//   Z22-Z29                 accumulators for outputs 0..7
//   Z30 / Z31               current input block / per-product temporary
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// saves and restores BP, which is used as scratch here - confirm.
TEXT ·mulGFNI_10x8_64Xor(SB), $8-88
// Loading 22 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 90 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x8_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
VBROADCASTF32X2 160(CX), Z20
VBROADCASTF32X2 168(CX), Z21
// Fetch the data pointer of each input slice (DX is reused last)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x8_64Xor_loop:
// Load 8 outputs
// (existing output bytes seed the accumulators)
MOVQ (R14), BP
VMOVDQU64 (BP)(R15*1), Z22
MOVQ 24(R14), BP
VMOVDQU64 (BP)(R15*1), Z23
MOVQ 48(R14), BP
VMOVDQU64 (BP)(R15*1), Z24
MOVQ 72(R14), BP
VMOVDQU64 (BP)(R15*1), Z25
MOVQ 96(R14), BP
VMOVDQU64 (BP)(R15*1), Z26
MOVQ 120(R14), BP
VMOVDQU64 (BP)(R15*1), Z27
MOVQ 144(R14), BP
VMOVDQU64 (BP)(R15*1), Z28
MOVQ 168(R14), BP
VMOVDQU64 (BP)(R15*1), Z29
// Load and process 64 bytes from input 0 to 8 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 8 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 8 outputs
// (matrix entries from index 22 on are broadcast straight from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 8 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 8 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 8 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 8 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 8 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 8 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 8 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 8 outputs
// (each output pointer is re-read from its slice header every iteration)
MOVQ (R14), BP
VMOVDQU64 Z22, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z23, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x8_64Xor_loop
VZEROUPPER
mulGFNI_10x8_64Xor_end:
RET
// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// Accumulating variant: 10 input slices are folded into 8 output slices,
// 32 bytes per iteration. Each output block is first loaded from memory,
// then XORed with VGF2P8AFFINEQB(block of in[i], matrix[i*8+j]) for every
// input i, and written back. The loop runs n>>5 times, so a tail of n&31
// bytes is left untouched; nothing is done when n < 32.
//
// Register roles:
//   AX                      remaining 32-byte iterations
//   CX                      matrix base (80 entries, 8 bytes each)
//   BX, SI, DI, R8-R13, DX  data pointers of in[0]..in[9], advanced by start
//   R14                     base of the out slice headers (24 bytes apart)
//   R15                     byte offset added to every output pointer (begins at start)
//   BP                      scratch: data pointer of the output being loaded/stored
//   Y0-Y5                   matrix entries 0..5 broadcast across the register
//   Y6-Y13                  accumulators for outputs 0..7
//   Y14 / Y15               current input block / matrix entry and product temporary
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler
// saves and restores BP, which is used as scratch here - confirm.
TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88
// Loading 6 of 80 tables to registers
// Destination kept on stack
// Full registers estimated 90 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x8Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
VBROADCASTSD 40(CX), Y5
// Fetch the data pointer of each input slice (DX is reused last)
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x8Xor_loop:
// Load 8 outputs
// (existing output bytes seed the accumulators)
MOVQ (R14), BP
VMOVDQU (BP)(R15*1), Y6
MOVQ 24(R14), BP
VMOVDQU (BP)(R15*1), Y7
MOVQ 48(R14), BP
VMOVDQU (BP)(R15*1), Y8
MOVQ 72(R14), BP
VMOVDQU (BP)(R15*1), Y9
MOVQ 96(R14), BP
VMOVDQU (BP)(R15*1), Y10
MOVQ 120(R14), BP
VMOVDQU (BP)(R15*1), Y11
MOVQ 144(R14), BP
VMOVDQU (BP)(R15*1), Y12
MOVQ 168(R14), BP
VMOVDQU (BP)(R15*1), Y13
// Load and process 32 bytes from input 0 to 8 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y9, Y15, Y9
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y10, Y15, Y10
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
VXORPD Y11, Y15, Y11
// (from here on every matrix entry is broadcast from memory into Y15)
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 8 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 8 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 8 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 8 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 8 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 8 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 8 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 8 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 8 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 8 outputs
// (each output pointer is re-read from its slice header every iteration)
MOVQ (R14), BP
VMOVDQU Y6, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y7, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x8Xor_loop
VZEROUPPER
mulAvxGFNI_10x8Xor_end:
RET
// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x9_64 produces 9 outputs from 10 inputs, 64 bytes per iteration.
// For each output j the 64-byte block at out[j][start+k*64] is OVERWRITTEN with
// the XOR, over all inputs i, of VGF2P8AFFINEQB(block of in[i], matrix[i*9+j]).
// The loop runs n>>6 times; the trailing n&63 bytes are not processed and
// nothing is read or written when n < 64.
//
// Register use:
//   AX                  remaining iteration count (n >> 6)
//   CX                  matrix base; entries 21..89 are read from memory in the loop
//   Z0-Z20              matrix[0..20], each 64-bit entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX  read cursors for in[0]..in[9], pre-advanced by start
//   R14                 base of the out slice headers (24 bytes per header)
//   R15                 byte offset applied to every output, initially start
//   BP                  scratch: data pointer of the output being stored
//   Z21-Z29             accumulators for outputs 0..8
//   Z30, Z31            current input block, product scratch
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler emits
// a prologue/epilogue that preserves BP, which is clobbered below - confirm.
TEXT ·mulGFNI_10x9_64(SB), $8-88
	// Loading 21 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 101 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ with a non-zero count already sets ZF from its result, so no
	// separate TESTQ is needed (same pattern as sSE2XorSlice in this file).
	JZ mulGFNI_10x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice header; DX is loaded last
	// because it also holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// out_base is loaded exactly once (the generator emitted this load twice).
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here)
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (from matrix[21] on, entries come from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 9 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ (R14), BP
	VMOVDQU64 Z21, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x9_64_loop
	VZEROUPPER

mulGFNI_10x9_64_end:
	RET
// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// mulAvxGFNI_10x9 produces 9 outputs from 10 inputs, 32 bytes per iteration.
// For each output j the 32-byte block at out[j][start+k*32] is OVERWRITTEN with
// the XOR, over all inputs i, of VGF2P8AFFINEQB(block of in[i], matrix[i*9+j]).
// The loop runs n>>5 times; the trailing n&31 bytes are not processed and
// nothing is read or written when n < 32.
//
// Register use:
//   AX                  remaining iteration count (n >> 5)
//   CX                  matrix base; entries 5..89 are re-broadcast in the loop
//   Y0-Y4               matrix[0..4], each 64-bit entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX  read cursors for in[0]..in[9], pre-advanced by start
//   R14                 base of the out slice headers (24 bytes per header)
//   R15                 byte offset applied to every output, initially start
//   BP                  scratch: data pointer of the output being stored
//   Y5-Y13              accumulators for outputs 0..8
//   Y14, Y15            current input block, matrix entry / product scratch
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler emits
// a prologue/epilogue that preserves BP, which is clobbered below - confirm.
TEXT ·mulAvxGFNI_10x9(SB), $8-88
	// Loading 5 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 101 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX

	// SHRQ with a non-zero count already sets ZF from its result, so no
	// separate TESTQ is needed (same pattern as sSE2XorSlice in this file).
	JZ mulAvxGFNI_10x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4

	// Fetch the data pointer of each input slice header; DX is loaded last
	// because it also holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// out_base is loaded exactly once (the generator emitted this load twice).
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	// (input 0 initialises the accumulators, so no XOR is needed here; for
	// outputs 5..8 the accumulator register doubles as the matrix operand)
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 9 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 9 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 9 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 9 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ (R14), BP
	VMOVDQU Y5, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y6, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x9_loop
	VZEROUPPER

mulAvxGFNI_10x9_end:
	RET
// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// mulGFNI_10x9_64Xor is the accumulating form of the 10-input / 9-output
// kernel, 64 bytes per iteration. For each output j the existing 64-byte block
// at out[j][start+k*64] is loaded, XORed with VGF2P8AFFINEQB(block of in[i],
// matrix[i*9+j]) for every input i, and written back to the same location.
// The loop runs n>>6 times; the trailing n&63 bytes are not processed and
// nothing is read or written when n < 64.
//
// Register use:
//   AX                  remaining iteration count (n >> 6)
//   CX                  matrix base; entries 21..89 are read from memory in the loop
//   Z0-Z20              matrix[0..20], each 64-bit entry broadcast to every lane
//   BX,SI,DI,R8-R13,DX  read cursors for in[0]..in[9], pre-advanced by start
//   R14                 base of the out slice headers (24 bytes per header)
//   R15                 byte offset applied to every output, initially start
//   BP                  scratch: data pointer of the output being loaded/stored
//   Z21-Z29             accumulators for outputs 0..8
//   Z30, Z31            current input block, product scratch
//
// NOTE(review): the 8-byte frame is presumably declared so the assembler emits
// a prologue/epilogue that preserves BP, which is clobbered below - confirm.
TEXT ·mulGFNI_10x9_64Xor(SB), $8-88
	// Loading 21 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 101 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX

	// SHRQ with a non-zero count already sets ZF from its result, so no
	// separate TESTQ is needed (same pattern as sSE2XorSlice in this file).
	JZ mulGFNI_10x9_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20

	// Fetch the data pointer of each input slice header; DX is loaded last
	// because it also holds the header base until then.
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX

	// out_base is loaded exactly once (the generator emitted this load twice).
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x9_64Xor_loop:
	// Load 9 outputs
	// (the current output contents seed the accumulators)
	MOVQ (R14), BP
	VMOVDQU64 (BP)(R15*1), Z21
	MOVQ 24(R14), BP
	VMOVDQU64 (BP)(R15*1), Z22
	MOVQ 48(R14), BP
	VMOVDQU64 (BP)(R15*1), Z23
	MOVQ 72(R14), BP
	VMOVDQU64 (BP)(R15*1), Z24
	MOVQ 96(R14), BP
	VMOVDQU64 (BP)(R15*1), Z25
	MOVQ 120(R14), BP
	VMOVDQU64 (BP)(R15*1), Z26
	MOVQ 144(R14), BP
	VMOVDQU64 (BP)(R15*1), Z27
	MOVQ 168(R14), BP
	VMOVDQU64 (BP)(R15*1), Z28
	MOVQ 192(R14), BP
	VMOVDQU64 (BP)(R15*1), Z29

	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	// (from matrix[21] on, entries come from memory via embedded broadcast)
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 9 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 9 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 9 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 9 outputs
	VMOVDQU64 (R13), Z30
	ADDQ $0x40, R13
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 9 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	// (output pointers are re-read from the slice headers every iteration)
	MOVQ (R14), BP
	VMOVDQU64 Z21, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU64 Z22, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU64 Z23, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU64 Z24, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU64 Z25, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU64 Z26, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU64 Z27, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU64 Z28, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU64 Z29, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x40, R15
	DECQ AX
	JNZ mulGFNI_10x9_64Xor_loop
	VZEROUPPER

mulGFNI_10x9_64Xor_end:
	RET
// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88
// Loading 5 of 90 tables to registers
// Destination kept on stack
// Full registers estimated 101 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x05, AX
TESTQ AX, AX
JZ mulAvxGFNI_10x9Xor_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
VBROADCASTSD 32(CX), Y4
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x9Xor_loop:
// Load 9 outputs
MOVQ (R14), BP
VMOVDQU (BP)(R15*1), Y5
MOVQ 24(R14), BP
VMOVDQU (BP)(R15*1), Y6
MOVQ 48(R14), BP
VMOVDQU (BP)(R15*1), Y7
MOVQ 72(R14), BP
VMOVDQU (BP)(R15*1), Y8
MOVQ 96(R14), BP
VMOVDQU (BP)(R15*1), Y9
MOVQ 120(R14), BP
VMOVDQU (BP)(R15*1), Y10
MOVQ 144(R14), BP
VMOVDQU (BP)(R15*1), Y11
MOVQ 168(R14), BP
VMOVDQU (BP)(R15*1), Y12
MOVQ 192(R14), BP
VMOVDQU (BP)(R15*1), Y13
// Load and process 32 bytes from input 0 to 9 outputs
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
VXORPD Y5, Y15, Y5
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
VXORPD Y6, Y15, Y6
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
VXORPD Y7, Y15, Y7
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
VXORPD Y8, Y15, Y8
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 40(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 48(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 56(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 64(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 1 to 9 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 72(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 9 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 9 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 9 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 9 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 9 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 9 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 9 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 640(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 9 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 648(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 656(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 664(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 672(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 680(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 688(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 696(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 704(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 712(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 9 outputs
MOVQ (R14), BP
VMOVDQU Y5, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y6, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y7, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 192(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x9Xor_loop
VZEROUPPER
mulAvxGFNI_10x9Xor_end:
RET
// mulGFNI_10x10_64 computes 10 outputs from 10 inputs, 64 bytes per loop
// iteration, and overwrites the outputs with the result.
//
// For every output j the stored block is the XOR, over all inputs i, of
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+j]) with a zero immediate.
// The matrix is therefore read as 100 uint64 entries, input-major.
//
// Only whole 64-byte blocks are handled: the loop runs n>>6 times and the
// routine returns immediately when that count is zero. Any n%64 tail bytes
// are not touched here.
//
// start is added to every input pointer and is used as the initial byte
// offset into every output slice.
//
// Register roles:
//   AX            remaining 64-byte blocks
//   CX            matrix base pointer
//   BX,SI,DI,R8-R13,DX  read pointers for inputs 0..9
//   R14           pointer to the out slice headers (24 bytes apart)
//   R15           current byte offset into each output
//   BP            scratch: data pointer of the output being stored
//   Z0-Z19        matrix entries 0..19 (all of inputs 0 and 1), kept resident
//   Z20-Z29       accumulators for outputs 0..9
//   Z30           current input block
//   Z31           scratch product
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is what makes the assembler save and restore it - confirm.
//
// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x10_64(SB), $8-88
// Loading 20 of 100 tables to registers
// Output pointers are not held in registers; they are re-read from the
// out slice headers on every iteration.
// Full registers estimated 112 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from its result, so JZ can follow directly.
SHRQ $0x06, AX
JZ mulGFNI_10x10_64_end
// Matrix entries for input 0 (Z0-Z9) and input 1 (Z10-Z19)
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
// Fetch the data pointer of each input slice; DX is overwritten last
// because it holds the in base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x10_64_loop:
// Load and process 64 bytes from input 0 to 10 outputs
// (first input initialises the accumulators, so no XOR is needed)
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
// (from here on the matrix entries are broadcast straight from memory)
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 10 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 10 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
// Each output's data pointer is re-read from its slice header, then
// indexed by the running byte offset in R15.
MOVQ (R14), BP
VMOVDQU64 Z20, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z21, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z22, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z23, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 192(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 216(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x10_64_loop
VZEROUPPER
mulGFNI_10x10_64_end:
RET
// mulAvxGFNI_10x10 computes 10 outputs from 10 inputs, 32 bytes per loop
// iteration, and overwrites the outputs with the result.
//
// For every output j the stored block is the XOR, over all inputs i, of
// VGF2P8AFFINEQB(in[i] block, matrix[i*10+j]) with a zero immediate.
// The matrix is therefore read as 100 uint64 entries, input-major.
//
// Only whole 32-byte blocks are handled: the loop runs n>>5 times and the
// routine returns immediately when that count is zero. Any n%32 tail bytes
// are not touched here.
//
// start is added to every input pointer and is used as the initial byte
// offset into every output slice.
//
// Register roles:
//   AX            remaining 32-byte blocks
//   CX            matrix base pointer
//   BX,SI,DI,R8-R13,DX  read pointers for inputs 0..9
//   R14           pointer to the out slice headers (24 bytes apart)
//   R15           current byte offset into each output
//   BP            scratch: data pointer of the output being stored
//   Y0-Y3         matrix entries 0..3, kept resident
//   Y4-Y13        accumulators for outputs 0..9
//   Y14           current input block
//   Y15           scratch: broadcast matrix entry, then its product
//
// NOTE(review): BP is used as a scratch register; presumably the non-zero
// frame size ($8) is what makes the assembler save and restore it - confirm.
//
// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x10(SB), $8-88
// Loading 4 of 100 tables to registers
// Output pointers are not held in registers; they are re-read from the
// out slice headers on every iteration.
// Full registers estimated 112 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
// SHRQ with a non-zero count sets ZF from its result, so JZ can follow directly.
SHRQ $0x05, AX
JZ mulAvxGFNI_10x10_end
VBROADCASTSD (CX), Y0
VBROADCASTSD 8(CX), Y1
VBROADCASTSD 16(CX), Y2
VBROADCASTSD 24(CX), Y3
// Fetch the data pointer of each input slice; DX is overwritten last
// because it holds the in base until then.
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulAvxGFNI_10x10_loop:
// Load and process 32 bytes from input 0 to 10 outputs
// (first input initialises the accumulators, so no XOR is needed;
// entries 4..9 are broadcast into the accumulator and transformed in place)
VMOVDQU (BX), Y14
ADDQ $0x20, BX
VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
VBROADCASTSD 32(CX), Y8
VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
VBROADCASTSD 40(CX), Y9
VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
VBROADCASTSD 48(CX), Y10
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
VBROADCASTSD 56(CX), Y11
VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
VBROADCASTSD 64(CX), Y12
VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
VBROADCASTSD 72(CX), Y13
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13
// Load and process 32 bytes from input 1 to 10 outputs
VMOVDQU (SI), Y14
ADDQ $0x20, SI
VBROADCASTSD 80(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 88(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 96(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 112(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 120(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 128(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 136(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 144(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 152(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 2 to 10 outputs
VMOVDQU (DI), Y14
ADDQ $0x20, DI
VBROADCASTSD 160(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 168(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 176(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 184(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 192(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 200(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 208(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 216(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 224(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 232(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 3 to 10 outputs
VMOVDQU (R8), Y14
ADDQ $0x20, R8
VBROADCASTSD 240(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 248(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 256(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 264(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 272(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 280(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 288(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 296(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 304(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 312(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 4 to 10 outputs
VMOVDQU (R9), Y14
ADDQ $0x20, R9
VBROADCASTSD 320(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 328(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 336(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 344(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 352(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 360(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 368(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 376(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 384(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 392(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 5 to 10 outputs
VMOVDQU (R10), Y14
ADDQ $0x20, R10
VBROADCASTSD 400(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 408(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 416(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 424(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 432(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 440(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 448(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 456(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 464(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 472(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 6 to 10 outputs
VMOVDQU (R11), Y14
ADDQ $0x20, R11
VBROADCASTSD 480(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 488(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 496(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 504(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 512(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 520(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 528(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 536(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 544(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 552(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 7 to 10 outputs
VMOVDQU (R12), Y14
ADDQ $0x20, R12
VBROADCASTSD 560(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 568(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 576(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 584(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 592(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 600(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 608(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 616(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 624(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 632(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 8 to 10 outputs
VMOVDQU (R13), Y14
ADDQ $0x20, R13
VBROADCASTSD 640(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 648(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 656(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 664(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 672(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 680(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 688(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 696(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 704(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 712(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Load and process 32 bytes from input 9 to 10 outputs
VMOVDQU (DX), Y14
ADDQ $0x20, DX
VBROADCASTSD 720(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y4, Y15, Y4
VBROADCASTSD 728(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y5, Y15, Y5
VBROADCASTSD 736(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y6, Y15, Y6
VBROADCASTSD 744(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y7, Y15, Y7
VBROADCASTSD 752(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y8, Y15, Y8
VBROADCASTSD 760(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y9, Y15, Y9
VBROADCASTSD 768(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y10, Y15, Y10
VBROADCASTSD 776(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y11, Y15, Y11
VBROADCASTSD 784(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y12, Y15, Y12
VBROADCASTSD 792(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
VXORPD Y13, Y15, Y13
// Store 10 outputs
// Each output's data pointer is re-read from its slice header, then
// indexed by the running byte offset in R15.
MOVQ (R14), BP
VMOVDQU Y4, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU Y5, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU Y6, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU Y7, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU Y8, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU Y9, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU Y10, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU Y11, (BP)(R15*1)
MOVQ 192(R14), BP
VMOVDQU Y12, (BP)(R15*1)
MOVQ 216(R14), BP
VMOVDQU Y13, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x20, R15
DECQ AX
JNZ mulAvxGFNI_10x10_loop
VZEROUPPER
mulAvxGFNI_10x10_end:
RET
// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x10_64Xor(SB), $8-88
// Loading 20 of 100 tables to registers
// Destination kept on stack
// Full registers estimated 112 YMM used
MOVQ n+80(FP), AX
MOVQ matrix_base+0(FP), CX
SHRQ $0x06, AX
TESTQ AX, AX
JZ mulGFNI_10x10_64Xor_end
VBROADCASTF32X2 (CX), Z0
VBROADCASTF32X2 8(CX), Z1
VBROADCASTF32X2 16(CX), Z2
VBROADCASTF32X2 24(CX), Z3
VBROADCASTF32X2 32(CX), Z4
VBROADCASTF32X2 40(CX), Z5
VBROADCASTF32X2 48(CX), Z6
VBROADCASTF32X2 56(CX), Z7
VBROADCASTF32X2 64(CX), Z8
VBROADCASTF32X2 72(CX), Z9
VBROADCASTF32X2 80(CX), Z10
VBROADCASTF32X2 88(CX), Z11
VBROADCASTF32X2 96(CX), Z12
VBROADCASTF32X2 104(CX), Z13
VBROADCASTF32X2 112(CX), Z14
VBROADCASTF32X2 120(CX), Z15
VBROADCASTF32X2 128(CX), Z16
VBROADCASTF32X2 136(CX), Z17
VBROADCASTF32X2 144(CX), Z18
VBROADCASTF32X2 152(CX), Z19
MOVQ in_base+24(FP), DX
MOVQ (DX), BX
MOVQ 24(DX), SI
MOVQ 48(DX), DI
MOVQ 72(DX), R8
MOVQ 96(DX), R9
MOVQ 120(DX), R10
MOVQ 144(DX), R11
MOVQ 168(DX), R12
MOVQ 192(DX), R13
MOVQ 216(DX), DX
MOVQ out_base+48(FP), R14
MOVQ out_base+48(FP), R14
MOVQ start+72(FP), R15
// Add start offset to input
ADDQ R15, BX
ADDQ R15, SI
ADDQ R15, DI
ADDQ R15, R8
ADDQ R15, R9
ADDQ R15, R10
ADDQ R15, R11
ADDQ R15, R12
ADDQ R15, R13
ADDQ R15, DX
mulGFNI_10x10_64Xor_loop:
// Load 10 outputs
MOVQ (R14), BP
VMOVDQU64 (BP)(R15*1), Z20
MOVQ 24(R14), BP
VMOVDQU64 (BP)(R15*1), Z21
MOVQ 48(R14), BP
VMOVDQU64 (BP)(R15*1), Z22
MOVQ 72(R14), BP
VMOVDQU64 (BP)(R15*1), Z23
MOVQ 96(R14), BP
VMOVDQU64 (BP)(R15*1), Z24
MOVQ 120(R14), BP
VMOVDQU64 (BP)(R15*1), Z25
MOVQ 144(R14), BP
VMOVDQU64 (BP)(R15*1), Z26
MOVQ 168(R14), BP
VMOVDQU64 (BP)(R15*1), Z27
MOVQ 192(R14), BP
VMOVDQU64 (BP)(R15*1), Z28
MOVQ 216(R14), BP
VMOVDQU64 (BP)(R15*1), Z29
// Load and process 64 bytes from input 0 to 10 outputs
VMOVDQU64 (BX), Z30
ADDQ $0x40, BX
VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 1 to 10 outputs
VMOVDQU64 (SI), Z30
ADDQ $0x40, SI
VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 2 to 10 outputs
VMOVDQU64 (DI), Z30
ADDQ $0x40, DI
VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 3 to 10 outputs
VMOVDQU64 (R8), Z30
ADDQ $0x40, R8
VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 4 to 10 outputs
VMOVDQU64 (R9), Z30
ADDQ $0x40, R9
VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 5 to 10 outputs
VMOVDQU64 (R10), Z30
ADDQ $0x40, R10
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 6 to 10 outputs
VMOVDQU64 (R11), Z30
ADDQ $0x40, R11
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 7 to 10 outputs
VMOVDQU64 (R12), Z30
ADDQ $0x40, R12
VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 8 to 10 outputs
VMOVDQU64 (R13), Z30
ADDQ $0x40, R13
VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Load and process 64 bytes from input 9 to 10 outputs
VMOVDQU64 (DX), Z30
ADDQ $0x40, DX
VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31
VXORPD Z20, Z31, Z20
VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31
VXORPD Z21, Z31, Z21
VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31
VXORPD Z22, Z31, Z22
VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31
VXORPD Z23, Z31, Z23
VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31
VXORPD Z24, Z31, Z24
VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31
VXORPD Z25, Z31, Z25
VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31
VXORPD Z26, Z31, Z26
VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31
VXORPD Z27, Z31, Z27
VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31
VXORPD Z28, Z31, Z28
VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31
VXORPD Z29, Z31, Z29
// Store 10 outputs
MOVQ (R14), BP
VMOVDQU64 Z20, (BP)(R15*1)
MOVQ 24(R14), BP
VMOVDQU64 Z21, (BP)(R15*1)
MOVQ 48(R14), BP
VMOVDQU64 Z22, (BP)(R15*1)
MOVQ 72(R14), BP
VMOVDQU64 Z23, (BP)(R15*1)
MOVQ 96(R14), BP
VMOVDQU64 Z24, (BP)(R15*1)
MOVQ 120(R14), BP
VMOVDQU64 Z25, (BP)(R15*1)
MOVQ 144(R14), BP
VMOVDQU64 Z26, (BP)(R15*1)
MOVQ 168(R14), BP
VMOVDQU64 Z27, (BP)(R15*1)
MOVQ 192(R14), BP
VMOVDQU64 Z28, (BP)(R15*1)
MOVQ 216(R14), BP
VMOVDQU64 Z29, (BP)(R15*1)
// Prepare for next loop
ADDQ $0x40, R15
DECQ AX
JNZ mulGFNI_10x10_64Xor_loop
VZEROUPPER
mulGFNI_10x10_64Xor_end:
RET
// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
//
// For every 32-byte block, each of the 10 outputs is XORed with the GF(2^8)
// affine transform of each of the 10 inputs:
//   out[j] ^= affine(matrix[i*10+j], in[i])   for i, j in 0..9
// n>>5 blocks are processed, beginning at byte offset start. Nothing is
// written when n < 32.
//
// Register use:
//   AX       blocks remaining
//   CX       matrix base
//   BX, SI, DI, R8-R13, DX   running pointers for in[0]..in[9]
//   R14      base of the out slice headers (24 bytes per header)
//   R15      byte offset into every output
//   BP       scratch for the current output data pointer
//   Y0-Y3    matrix[0..3], kept resident
//   Y4-Y13   accumulators for out[0]..out[9]
//   Y14      current input block, Y15 scratch
TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88
	// Loading 4 of 100 tables to registers
	// Destination kept on stack
	// Full registers estimated 112 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	// SHRQ with a non-zero count sets ZF from its result, so the branch can
	// consume it directly.
	SHRQ $0x05, AX
	JZ mulAvxGFNI_10x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulAvxGFNI_10x10Xor_loop:
	// Load 10 outputs
	MOVQ (R14), BP
	VMOVDQU (BP)(R15*1), Y4
	MOVQ 24(R14), BP
	VMOVDQU (BP)(R15*1), Y5
	MOVQ 48(R14), BP
	VMOVDQU (BP)(R15*1), Y6
	MOVQ 72(R14), BP
	VMOVDQU (BP)(R15*1), Y7
	MOVQ 96(R14), BP
	VMOVDQU (BP)(R15*1), Y8
	MOVQ 120(R14), BP
	VMOVDQU (BP)(R15*1), Y9
	MOVQ 144(R14), BP
	VMOVDQU (BP)(R15*1), Y10
	MOVQ 168(R14), BP
	VMOVDQU (BP)(R15*1), Y11
	MOVQ 192(R14), BP
	VMOVDQU (BP)(R15*1), Y12
	MOVQ 216(R14), BP
	VMOVDQU (BP)(R15*1), Y13

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y4, Y15, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 32(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 576(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 584(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 592(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 600(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 608(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 616(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 624(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 632(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 8 to 10 outputs
	VMOVDQU (R13), Y14
	ADDQ $0x20, R13
	VBROADCASTSD 640(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 648(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 656(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 664(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 672(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 680(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 688(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 696(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 704(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 712(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 9 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 720(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 728(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 736(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 744(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 752(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 760(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 768(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 776(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 784(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 792(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R14), BP
	VMOVDQU Y4, (BP)(R15*1)
	MOVQ 24(R14), BP
	VMOVDQU Y5, (BP)(R15*1)
	MOVQ 48(R14), BP
	VMOVDQU Y6, (BP)(R15*1)
	MOVQ 72(R14), BP
	VMOVDQU Y7, (BP)(R15*1)
	MOVQ 96(R14), BP
	VMOVDQU Y8, (BP)(R15*1)
	MOVQ 120(R14), BP
	VMOVDQU Y9, (BP)(R15*1)
	MOVQ 144(R14), BP
	VMOVDQU Y10, (BP)(R15*1)
	MOVQ 168(R14), BP
	VMOVDQU Y11, (BP)(R15*1)
	MOVQ 192(R14), BP
	VMOVDQU Y12, (BP)(R15*1)
	MOVQ 216(R14), BP
	VMOVDQU Y13, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ mulAvxGFNI_10x10Xor_loop
	VZEROUPPER

mulAvxGFNI_10x10Xor_end:
	RET
// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies the t01, t23 and t02 matrices.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1
	VBROADCASTF32X2 t02+48(FP), Z2

butterfly:
	VMOVDQU64 (SI), Z3
	VMOVDQU64 (DI), Z4
	VMOVDQU64 (R8), Z5
	VMOVDQU64 (AX), Z6

	// w1 ^= w0
	VXORPD Z4, Z3, Z4
	// w0 ^= affine(t01, w1)
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
	VXORPD Z3, Z7, Z3
	// w3 ^= w2
	VXORPD Z5, Z6, Z6
	// w2 ^= affine(t23, w3) ^ w0, folded into one three-way XOR
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VPTERNLOGD $0x96, Z7, Z3, Z5
	// w3 ^= w1
	VXORPD Z4, Z6, Z6
	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
	VXORPD Z3, Z7, Z3
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD Z4, Z7, Z4

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z3, (SI)
	VMOVDQU64 Z4, (DI)
	VMOVDQU64 Z5, (R8)
	VMOVDQU64 Z6, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies the t02 matrix first, then t01 and t23.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1
	VBROADCASTF32X2 t02+48(FP), Z2

butterfly:
	VMOVDQU64 (SI), Z3
	VMOVDQU64 (DI), Z4
	VMOVDQU64 (R8), Z5
	VMOVDQU64 (AX), Z6

	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
	VXORPD Z3, Z7, Z3
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD Z4, Z7, Z4
	// w2 ^= w0, w3 ^= w1
	VXORPD Z3, Z5, Z5
	VXORPD Z4, Z6, Z6
	// w0 ^= affine(t01, w1), then w1 ^= w0
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
	VXORPD Z3, Z7, Z3
	VXORPD Z4, Z3, Z4
	// w2 ^= affine(t23, w3), then w3 ^= w2
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VXORPD Z5, Z7, Z5
	VXORPD Z5, Z6, Z6

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z3, (SI)
	VMOVDQU64 Z4, (DI)
	VMOVDQU64 Z5, (R8)
	VMOVDQU64 Z6, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t23 and t02 matrices; the t01
// argument is never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t23+40(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

butterfly:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// w1 ^= w0, w3 ^= w2
	VXORPD Z3, Z2, Z3
	VXORPD Z4, Z5, Z5
	// w2 ^= affine(t23, w3) ^ w0, folded into one three-way XOR
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
	VPTERNLOGD $0x96, Z6, Z2, Z4
	// w3 ^= w1
	VXORPD Z3, Z5, Z5
	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z2, (SI)
	VMOVDQU64 Z3, (DI)
	VMOVDQU64 Z4, (R8)
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t01 and t23 matrices; the t02
// argument is never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1

butterfly:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// w2 ^= w0, w3 ^= w1
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5
	// w0 ^= affine(t01, w1), then w1 ^= w0
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	VXORPD Z3, Z2, Z3
	// w2 ^= affine(t23, w3), then w3 ^= w2
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z4, Z6, Z4
	VXORPD Z4, Z5, Z5

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z2, (SI)
	VMOVDQU64 Z3, (DI)
	VMOVDQU64 Z4, (R8)
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t01 and t02 matrices; the t23
// argument is never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

butterfly:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// w1 ^= w0
	VXORPD Z3, Z2, Z3
	// w0 ^= affine(t01, w1)
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	// w3 ^= w2, w2 ^= w0, w3 ^= w1
	VXORPD Z4, Z5, Z5
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5
	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z2, (SI)
	VMOVDQU64 Z3, (DI)
	VMOVDQU64 Z4, (R8)
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t02 and t23 matrices; the t01
// argument is never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t23+40(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

butterfly:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3
	// w2 ^= w0, w3 ^= w1, w1 ^= w0
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5
	VXORPD Z3, Z2, Z3
	// w2 ^= affine(t23, w3), then w3 ^= w2
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
	VXORPD Z4, Z6, Z4
	VXORPD Z4, Z5, Z5

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z2, (SI)
	VMOVDQU64 Z3, (DI)
	VMOVDQU64 Z4, (R8)
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t02 matrix; the t01 and t23
// arguments are never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t02+48(FP), Z0

butterfly:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// w1 ^= w0, w3 ^= w2, w2 ^= w0, w3 ^= w1
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
	VXORPD Z1, Z5, Z1
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z2, Z5, Z2

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z1, (SI)
	VMOVDQU64 Z2, (DI)
	VMOVDQU64 Z3, (R8)
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t23 matrix; the t01 and t02
// arguments are never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t23+40(FP), Z0

butterfly:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// w2 ^= w0, w3 ^= w1, w1 ^= w0
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	VXORPD Z2, Z1, Z2
	// w2 ^= affine(t23, w3), then w3 ^= w2
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z3, Z5, Z3
	VXORPD Z3, Z4, Z4

	// Write back (w0 is stored unchanged), then advance the pointers.
	VMOVDQU64 Z1, (SI)
	VMOVDQU64 Z2, (DI)
	VMOVDQU64 Z3, (R8)
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t01 and t23 matrices; the t02
// argument is never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1

butterfly:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// w1 ^= w0
	VXORPD Z3, Z2, Z3
	// w0 ^= affine(t01, w1)
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	// w3 ^= w2
	VXORPD Z4, Z5, Z5
	// w2 ^= affine(t23, w3) ^ w0, folded into one three-way XOR
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VPTERNLOGD $0x96, Z6, Z2, Z4
	// w3 ^= w1
	VXORPD Z3, Z5, Z5

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z2, (SI)
	VMOVDQU64 Z3, (DI)
	VMOVDQU64 Z4, (R8)
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t02 and t01 matrices; the t23
// argument is never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1

butterfly:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3
	// w2 ^= w0, w3 ^= w1
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5
	// w0 ^= affine(t01, w1), then w1 ^= w0
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	VXORPD Z3, Z2, Z3
	// w3 ^= w2
	VXORPD Z4, Z5, Z5

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z2, (SI)
	VMOVDQU64 Z3, (DI)
	VMOVDQU64 Z4, (R8)
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t23 matrix; the t01 and t02
// arguments are never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t23+40(FP), Z0

butterfly:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// w1 ^= w0, w3 ^= w2
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4
	// w2 ^= affine(t23, w3) ^ w0, folded into one three-way XOR
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VPTERNLOGD $0x96, Z5, Z1, Z3
	// w3 ^= w1
	VXORPD Z2, Z4, Z4

	// Write back (w0 is stored unchanged), then advance the pointers.
	VMOVDQU64 Z1, (SI)
	VMOVDQU64 Z2, (DI)
	VMOVDQU64 Z3, (R8)
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t01 matrix; the t23 and t02
// arguments are never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0

butterfly:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// w2 ^= w0, w3 ^= w1
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	// w0 ^= affine(t01, w1), then w1 ^= w0
	VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
	VXORPD Z1, Z5, Z1
	VXORPD Z2, Z1, Z2
	// w3 ^= w2
	VXORPD Z3, Z4, Z4

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z1, (SI)
	VMOVDQU64 Z2, (DI)
	VMOVDQU64 Z3, (R8)
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t01 matrix; the t23 and t02
// arguments are never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t01+32(FP), Z0

butterfly:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// w1 ^= w0
	VXORPD Z2, Z1, Z2
	// w0 ^= affine(t01, w1)
	VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
	VXORPD Z1, Z5, Z1
	// w3 ^= w2, w2 ^= w0, w3 ^= w1
	VXORPD Z3, Z4, Z4
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z1, (SI)
	VMOVDQU64 Z2, (DI)
	VMOVDQU64 Z3, (R8)
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
//
// Four-slice butterfly that applies only the t02 matrix; the t01 and t23
// arguments are never read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer
	VBROADCASTF32X2 t02+48(FP), Z0

butterfly:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// w0 ^= affine(t02, w2)
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
	VXORPD Z1, Z5, Z1
	// w1 ^= affine(t02, w3)
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z2, Z5, Z2
	// w2 ^= w0, w3 ^= w1, w1 ^= w0, w3 ^= w2
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4

	// Write back, then advance all four pointers by one block.
	VMOVDQU64 Z1, (SI)
	VMOVDQU64 Z2, (DI)
	VMOVDQU64 Z3, (R8)
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F
//
// Four-slice butterfly made of XORs only; none of t01, t23 or t02 is read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer

butterfly:
	VMOVDQU64 (SI), Z0
	VMOVDQU64 (DI), Z1
	VMOVDQU64 (R8), Z2
	VMOVDQU64 (AX), Z3

	// w1 ^= w0, w3 ^= w2, w2 ^= w0, w3 ^= w1
	VXORPD Z1, Z0, Z1
	VXORPD Z2, Z3, Z3
	VXORPD Z0, Z2, Z2
	VXORPD Z1, Z3, Z3

	// Write back (w0 is stored unchanged), then advance the pointers.
	VMOVDQU64 Z0, (SI)
	VMOVDQU64 Z1, (DI)
	VMOVDQU64 Z2, (R8)
	VMOVDQU64 Z3, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET
// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F
//
// Four-slice butterfly made of XORs only; none of t01, t23 or t02 is read.
// Slice headers are read at byte offsets 0, dist, 2*dist and 3*dist from
// work_base; the byte count is the length word of the first header. Each pass
// handles 64 bytes per slice and the body runs once before the count is tested.
// Below, w0..w3 name the 64-byte blocks of the four slices.
TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56
	MOVQ work_base+0(FP), CX
	MOVQ dist+24(FP), AX
	MOVQ (CX), SI // w0 data pointer
	MOVQ 8(CX), DX // bytes remaining
	MOVQ (CX)(AX*1), DI // w1 data pointer
	MOVQ (CX)(AX*2), R8 // w2 data pointer
	LEAQ (AX)(AX*2), BX // 3*dist
	MOVQ (CX)(BX*1), AX // w3 data pointer

butterfly:
	VMOVDQU64 (SI), Z0
	VMOVDQU64 (DI), Z1
	VMOVDQU64 (R8), Z2
	VMOVDQU64 (AX), Z3

	// w2 ^= w0, w3 ^= w1, w1 ^= w0, w3 ^= w2
	VXORPD Z0, Z2, Z2
	VXORPD Z1, Z3, Z3
	VXORPD Z1, Z0, Z1
	VXORPD Z2, Z3, Z3

	// Write back (w0 is stored unchanged), then advance the pointers.
	VMOVDQU64 Z0, (SI)
	VMOVDQU64 Z1, (DI)
	VMOVDQU64 Z2, (R8)
	VMOVDQU64 Z3, (AX)
	ADDQ $0x40, SI
	ADDQ $0x40, DI
	ADDQ $0x40, R8
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA butterfly
	VZEROUPPER
	RET