// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT.

//go:build !appengine && !noasm && !nogen && !nopshufb && gc

#include "textflag.h"

// func _dummy_()
TEXT ·_dummy_(SB), $0
#ifdef GOAMD64_v4
#define XOR3WAY(ignore, a, b, dst) \
    VPTERNLOGD $0x96, a, b, dst

#else
#define XOR3WAY(ignore, a, b, dst) \
    VPXOR a, dst, dst \
    VPXOR b, dst, dst

#endif
    RET

// sSE2XorSlice will XOR in with out and store in out.
// Processes 16 bytes/loop.

// func sSE2XorSlice(in []byte, out []byte)
// Requires: SSE2
TEXT ·sSE2XorSlice(SB), $0-48
    MOVQ in_base+0(FP), AX
    MOVQ out_base+24(FP), CX
    MOVQ in_len+8(FP), DX
    SHRQ $0x04, DX
    JZ   end

loop:
    MOVOU (AX), X0
    MOVOU (CX), X1
    PXOR  X0, X1
    MOVOU X1, (CX)
    ADDQ  $0x10, AX
    ADDQ  $0x10, CX
    DECQ  DX
    JNZ   loop

end:
    RET

// sSE2XorSlice_64 will XOR in with out and store in out.
// Processes 64 bytes/loop.

// func sSE2XorSlice_64(in []byte, out []byte)
// Requires: SSE2
TEXT ·sSE2XorSlice_64(SB), $0-48
    MOVQ in_base+0(FP), AX
    MOVQ out_base+24(FP), CX
    MOVQ in_len+8(FP), DX
    SHRQ $0x06, DX
    JZ   end

loop:
    MOVOU (AX), X0
    MOVOU 16(AX), X2
    MOVOU 32(AX), X4
    MOVOU 48(AX), X6
    MOVOU (CX), X1
    MOVOU 16(CX), X3
    MOVOU 32(CX), X5
    MOVOU 48(CX), X7
    PXOR  X0, X1
    PXOR  X2, X3
    PXOR  X4, X5
    PXOR  X6, X7
    MOVOU X1, (CX)
    MOVOU X3, 16(CX)
    MOVOU X5, 32(CX)
    MOVOU X7, 48(CX)
    ADDQ  $0x40, AX
    ADDQ  $0x40, CX
    DECQ  DX
    JNZ   loop

end:
    RET

// avx2XorSlice_64 will XOR in with out and store in out.
// Processes 64 bytes/loop.

// func avx2XorSlice_64(in []byte, out []byte)
// Requires: AVX, AVX2
TEXT ·avx2XorSlice_64(SB), $0-48
    MOVQ in_base+0(FP), AX
    MOVQ out_base+24(FP), CX
    MOVQ in_len+8(FP), DX
    SHRQ $0x06, DX
    JZ   end

loop:
    VMOVDQU (AX), Y0
    VMOVDQU 32(AX), Y2
    VMOVDQU (CX), Y1
    VMOVDQU 32(CX), Y3
    VPXOR   Y0, Y1, Y1
    VPXOR   Y2, Y3, Y3
    VMOVDQU Y1, (CX)
    VMOVDQU Y3, 32(CX)
    ADDQ    $0x40, AX
    ADDQ    $0x40, CX
    DECQ    DX
    JNZ     loop

end:
    VZEROUPPER
    RET

// func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x1_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 10 YMM used
    MOVQ    n+80(FP), AX
    MOVQ    matrix_base+0(FP), CX
    SHRQ    $0x06, AX
    TESTQ   AX, AX
    JZ      mulAvxTwo_1x1_64_end
    VMOVDQU (CX), Y0
    VMOVDQU 32(CX), Y1
    MOVQ    in_base+24(FP), CX
    MOVQ    (CX), CX
    MOVQ    out_base+48(FP), DX
    MOVQ    out_base+48(FP), DX
    MOVQ    (DX), DX
    MOVQ    start+72(FP), BX

    // Add start offset to output
    ADDQ BX, DX

    // Add start offset to input
    ADDQ         BX, CX
    MOVQ         $0x0000000f, BX
    MOVQ         BX, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_1x1_64_loop:
    // Load and process 64 bytes from input 0 to 1 outputs
    VMOVDQU (CX), Y2
    VMOVDQU 32(CX), Y3
    ADDQ    $0x40, CX
    VPSRLQ  $0x04, Y2, Y6
    VPSRLQ  $0x04, Y3, Y5
    VPAND   Y4, Y2, Y2
    VPAND   Y4, Y3, Y3
    VPAND   Y4, Y6, Y6
    VPAND   Y4, Y5, Y5
    VPSHUFB Y2, Y0, Y2
    VPSHUFB Y3, Y0, Y3
    VPSHUFB Y6, Y1, Y6
    VPSHUFB Y5, Y1, Y5
    VPXOR   Y2, Y6, Y2
    VPXOR   Y3, Y5, Y3

    // Store 1 outputs
    VMOVDQU Y2, (DX)
    VMOVDQU Y3, 32(DX)
    ADDQ    $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x1_64_loop
    VZEROUPPER

mulAvxTwo_1x1_64_end:
    RET
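// The mulAvxTwo_* kernels compute GF(2^8) products with two 16-entry VPSHUFB
// lookups per table pair: the low nibble of each byte indexes one table, the
// high nibble (after VPSRLQ/VPAND) indexes a second, and the two halves are
// XORed together. Per-byte sketch in Go (table names illustrative, not part
// of this file's API):
//
//	out[i] = mulTblLo[c][in[i]&0x0f] ^ mulTblHi[c][in[i]>>4]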
// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x1_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 4 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x1_64_end
    VBROADCASTF32X2 (CX), Z0
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), DX
    MOVQ            start+72(FP), BX

    // Add start offset to output
    ADDQ BX, DX

    // Add start offset to input
    ADDQ BX, CX

mulGFNI_1x1_64_loop:
    // Load and process 64 bytes from input 0 to 1 outputs
    VMOVDQU64      (CX), Z1
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z1, Z1

    // Store 1 outputs
    VMOVDQU64 Z1, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x1_64_loop
    VZEROUPPER

mulGFNI_1x1_64_end:
    RET

// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x1(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 4 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x1_end
    VBROADCASTSD (CX), Y0
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), DX
    MOVQ         start+72(FP), BX

    // Add start offset to output
    ADDQ BX, DX

    // Add start offset to input
    ADDQ BX, CX

mulAvxGFNI_1x1_loop:
    // Load and process 32 bytes from input 0 to 1 outputs
    VMOVDQU        (CX), Y1
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y1, Y1

    // Store 1 outputs
    VMOVDQU Y1, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x1_loop
    VZEROUPPER

mulAvxGFNI_1x1_end:
    RET

// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x1_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 4 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x1_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), DX
    MOVQ            start+72(FP), BX

    // Add start offset to output
    ADDQ BX, DX

    // Add start offset to input
    ADDQ BX, CX

mulGFNI_1x1_64Xor_loop:
    // Load 1 outputs
    VMOVDQU64 (DX), Z1

    // Load and process 64 bytes from input 0 to 1 outputs
    VMOVDQU64      (CX), Z2
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z2, Z2
    VXORPD         Z1, Z2, Z1

    // Store 1 outputs
    VMOVDQU64 Z1, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x1_64Xor_loop
    VZEROUPPER

mulGFNI_1x1_64Xor_end:
    RET

// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 4 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x1Xor_end
    VBROADCASTSD (CX), Y0
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), DX
    MOVQ         start+72(FP), BX

    // Add start offset to output
    ADDQ BX, DX

    // Add start offset to input
    ADDQ BX, CX

mulAvxGFNI_1x1Xor_loop:
    // Load 1 outputs
    VMOVDQU (DX), Y1

    // Load and process 32 bytes from input 0 to 1 outputs
    VMOVDQU        (CX), Y2
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y2, Y2
    VXORPD         Y1, Y2, Y1

    // Store 1 outputs
    VMOVDQU Y1, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x1Xor_loop
    VZEROUPPER

mulAvxGFNI_1x1Xor_end:
    RET
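// The *Xor kernels differ from the plain ones only in that they first load
// the current output and accumulate into it (out ^= c*in instead of
// out = c*in), which is how contributions from further input shards are
// folded into an output shard.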
// func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 10 YMM used
    MOVQ    n+80(FP), AX
    MOVQ    matrix_base+0(FP), CX
    SHRQ    $0x06, AX
    TESTQ   AX, AX
    JZ      mulAvxTwo_1x1_64Xor_end
    VMOVDQU (CX), Y0
    VMOVDQU 32(CX), Y1
    MOVQ    in_base+24(FP), CX
    MOVQ    (CX), CX
    MOVQ    out_base+48(FP), DX
    MOVQ    out_base+48(FP), DX
    MOVQ    (DX), DX
    MOVQ    start+72(FP), BX

    // Add start offset to output
    ADDQ BX, DX

    // Add start offset to input
    ADDQ         BX, CX
    MOVQ         $0x0000000f, BX
    MOVQ         BX, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_1x1_64Xor_loop:
    // Load 1 outputs
    VMOVDQU (DX), Y2
    VMOVDQU 32(DX), Y3

    // Load and process 64 bytes from input 0 to 1 outputs
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y7
    ADDQ    $0x40, CX
    VPSRLQ  $0x04, Y5, Y6
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y5, Y5
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y6, Y6
    VPAND   Y4, Y8, Y8
    VPSHUFB Y5, Y0, Y5
    VPSHUFB Y7, Y0, Y7
    VPSHUFB Y6, Y1, Y6
    VPSHUFB Y8, Y1, Y8
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Store 1 outputs
    VMOVDQU Y2, (DX)
    VMOVDQU Y3, 32(DX)
    ADDQ    $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x1_64Xor_loop
    VZEROUPPER

mulAvxTwo_1x1_64Xor_end:
    RET

// func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x2_64(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 17 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x2_64_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), BX
    MOVQ  start+72(FP), DI

    // Add start offset to output
    ADDQ DI, SI
    ADDQ DI, BX

    // Add start offset to input
    ADDQ         DI, DX
    MOVQ         $0x0000000f, DI
    MOVQ         DI, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_1x2_64_loop:
    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU (DX), Y7
    VMOVDQU 32(DX), Y9
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y7, Y8
    VPSRLQ  $0x04, Y9, Y10
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y8, Y8
    VPAND   Y4, Y10, Y10
    VMOVDQU (CX), Y2
    VMOVDQU 32(CX), Y6
    VPSHUFB Y9, Y2, Y3
    VPSHUFB Y7, Y2, Y2
    VPSHUFB Y10, Y6, Y5
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y2, Y6, Y0
    VPXOR   Y3, Y5, Y1
    VMOVDQU 64(CX), Y2
    VMOVDQU 96(CX), Y6
    VPSHUFB Y9, Y2, Y3
    VPSHUFB Y7, Y2, Y2
    VPSHUFB Y10, Y6, Y5
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y2, Y6, Y2
    VPXOR   Y3, Y5, Y3

    // Store 2 outputs
    VMOVDQU Y0, (SI)
    VMOVDQU Y1, 32(SI)
    ADDQ    $0x40, SI
    VMOVDQU Y2, (BX)
    VMOVDQU Y3, 32(BX)
    ADDQ    $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x2_64_loop
    VZEROUPPER

mulAvxTwo_1x2_64_end:
    RET

// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x2_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 6 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x2_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), DX
    MOVQ            start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX
    ADDQ SI, DX

    // Add start offset to input
    ADDQ SI, CX

mulGFNI_1x2_64_loop:
    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU64      (CX), Z3
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z3, Z2
    VGF2P8AFFINEQB $0x00, Z1, Z3, Z3

    // Store 2 outputs
    VMOVDQU64 Z2, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z3, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x2_64_loop
    VZEROUPPER

mulGFNI_1x2_64_end:
    RET
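// The GFNI kernels encode multiplication by each matrix coefficient as an
// 8x8 bit matrix packed into one 64-bit word, broadcast to every lane with
// VBROADCASTF32X2 (ZMM) or VBROADCASTSD (YMM); a single VGF2P8AFFINEQB $0x00
// then replaces the shift/mask/shuffle/XOR sequence of the table-lookup
// kernels.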
// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x2(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 6 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x2_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), DX
    MOVQ         start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX
    ADDQ SI, DX

    // Add start offset to input
    ADDQ SI, CX

mulAvxGFNI_1x2_loop:
    // Load and process 32 bytes from input 0 to 2 outputs
    VMOVDQU        (CX), Y3
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y3, Y2
    VGF2P8AFFINEQB $0x00, Y1, Y3, Y3

    // Store 2 outputs
    VMOVDQU Y2, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y3, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x2_loop
    VZEROUPPER

mulAvxGFNI_1x2_end:
    RET

// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x2_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 6 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x2_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), DX
    MOVQ            start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX
    ADDQ SI, DX

    // Add start offset to input
    ADDQ SI, CX

mulGFNI_1x2_64Xor_loop:
    // Load 2 outputs
    VMOVDQU64 (BX), Z2
    VMOVDQU64 (DX), Z3

    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU64      (CX), Z4
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
    VXORPD         Z2, Z5, Z2
    VGF2P8AFFINEQB $0x00, Z1, Z4, Z5
    VXORPD         Z3, Z5, Z3

    // Store 2 outputs
    VMOVDQU64 Z2, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z3, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x2_64Xor_loop
    VZEROUPPER

mulGFNI_1x2_64Xor_end:
    RET

// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 6 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x2Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), DX
    MOVQ         start+72(FP), SI

    // Add start offset to output
    ADDQ SI, BX
    ADDQ SI, DX

    // Add start offset to input
    ADDQ SI, CX

mulAvxGFNI_1x2Xor_loop:
    // Load 2 outputs
    VMOVDQU (BX), Y2
    VMOVDQU (DX), Y3

    // Load and process 32 bytes from input 0 to 2 outputs
    VMOVDQU        (CX), Y4
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y4, Y5
    VXORPD         Y2, Y5, Y2
    VGF2P8AFFINEQB $0x00, Y1, Y4, Y5
    VXORPD         Y3, Y5, Y3

    // Store 2 outputs
    VMOVDQU Y2, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y3, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x2Xor_loop
    VZEROUPPER

mulAvxGFNI_1x2Xor_end:
    RET
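// Naming convention in this file: mul{AvxTwo,GFNI,AvxGFNI}_{i}x{o}[_64][Xor]
// processes i input shards into o output shards, on 64-byte blocks when the
// _64 suffix is present and 32-byte blocks otherwise, with Xor marking the
// accumulating variants. The functions shown here are the one-input (1xN)
// family.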
// func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 17 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x2_64Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), BX
    MOVQ  start+72(FP), DI

    // Add start offset to output
    ADDQ DI, SI
    ADDQ DI, BX

    // Add start offset to input
    ADDQ         DI, DX
    MOVQ         $0x0000000f, DI
    MOVQ         DI, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_1x2_64Xor_loop:
    // Load 2 outputs
    VMOVDQU (SI), Y0
    VMOVDQU 32(SI), Y1
    VMOVDQU (BX), Y2
    VMOVDQU 32(BX), Y3

    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU (DX), Y9
    VMOVDQU 32(DX), Y11
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y11, Y11
    VPAND   Y4, Y10, Y10
    VPAND   Y4, Y12, Y12
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Store 2 outputs
    VMOVDQU Y0, (SI)
    VMOVDQU Y1, 32(SI)
    ADDQ    $0x40, SI
    VMOVDQU Y2, (BX)
    VMOVDQU Y3, 32(BX)
    ADDQ    $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x2_64Xor_loop
    VZEROUPPER

mulAvxTwo_1x2_64Xor_end:
    RET

// func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x3_64(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 22 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x3_64_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), BX
    MOVQ  start+72(FP), R8

    // Add start offset to output
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, BX

    // Add start offset to input
    ADDQ         R8, DX
    MOVQ         $0x0000000f, R8
    MOVQ         R8, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_1x3_64_loop:
    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU (DX), Y9
    VMOVDQU 32(DX), Y11
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y6, Y9, Y9
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y10, Y10
    VPAND   Y6, Y12, Y12
    VMOVDQU (CX), Y4
    VMOVDQU 32(CX), Y8
    VPSHUFB Y11, Y4, Y5
    VPSHUFB Y9, Y4, Y4
    VPSHUFB Y12, Y8, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y4, Y8, Y0
    VPXOR   Y5, Y7, Y1
    VMOVDQU 64(CX), Y4
    VMOVDQU 96(CX), Y8
    VPSHUFB Y11, Y4, Y5
    VPSHUFB Y9, Y4, Y4
    VPSHUFB Y12, Y8, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y4, Y8, Y2
    VPXOR   Y5, Y7, Y3
    VMOVDQU 128(CX), Y4
    VMOVDQU 160(CX), Y8
    VPSHUFB Y11, Y4, Y5
    VPSHUFB Y9, Y4, Y4
    VPSHUFB Y12, Y8, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y4, Y8, Y4
    VPXOR   Y5, Y7, Y5

    // Store 3 outputs
    VMOVDQU Y0, (SI)
    VMOVDQU Y1, 32(SI)
    ADDQ    $0x40, SI
    VMOVDQU Y2, (DI)
    VMOVDQU Y3, 32(DI)
    ADDQ    $0x40, DI
    VMOVDQU Y4, (BX)
    VMOVDQU Y5, 32(BX)
    ADDQ    $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x3_64_loop
    VZEROUPPER

mulAvxTwo_1x3_64_end:
    RET
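// Every kernel turns n into an iteration count up front: SHRQ $0x06 for
// 64-byte kernels, SHRQ $0x05 for 32-byte ones. n is therefore assumed to be
// a multiple of the block size; any tail handling is presumed to live in the
// Go callers of the generated stubs, not in this file.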
// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x3_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 8 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x3_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DX
    MOVQ            start+72(FP), DI

    // Add start offset to output
    ADDQ DI, BX
    ADDQ DI, SI
    ADDQ DI, DX

    // Add start offset to input
    ADDQ DI, CX

mulGFNI_1x3_64_loop:
    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU64      (CX), Z5
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z5, Z3
    VGF2P8AFFINEQB $0x00, Z1, Z5, Z4
    VGF2P8AFFINEQB $0x00, Z2, Z5, Z5

    // Store 3 outputs
    VMOVDQU64 Z3, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z4, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z5, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x3_64_loop
    VZEROUPPER

mulGFNI_1x3_64_end:
    RET

// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x3(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 8 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x3_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DX
    MOVQ         start+72(FP), DI

    // Add start offset to output
    ADDQ DI, BX
    ADDQ DI, SI
    ADDQ DI, DX

    // Add start offset to input
    ADDQ DI, CX

mulAvxGFNI_1x3_loop:
    // Load and process 32 bytes from input 0 to 3 outputs
    VMOVDQU        (CX), Y5
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y5, Y3
    VGF2P8AFFINEQB $0x00, Y1, Y5, Y4
    VGF2P8AFFINEQB $0x00, Y2, Y5, Y5

    // Store 3 outputs
    VMOVDQU Y3, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y4, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y5, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x3_loop
    VZEROUPPER

mulAvxGFNI_1x3_end:
    RET

// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x3_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 8 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x3_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DX
    MOVQ            start+72(FP), DI

    // Add start offset to output
    ADDQ DI, BX
    ADDQ DI, SI
    ADDQ DI, DX

    // Add start offset to input
    ADDQ DI, CX

mulGFNI_1x3_64Xor_loop:
    // Load 3 outputs
    VMOVDQU64 (BX), Z3
    VMOVDQU64 (SI), Z4
    VMOVDQU64 (DX), Z5

    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU64      (CX), Z6
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z6, Z7
    VXORPD         Z3, Z7, Z3
    VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
    VXORPD         Z4, Z7, Z4
    VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
    VXORPD         Z5, Z7, Z5

    // Store 3 outputs
    VMOVDQU64 Z3, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z4, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z5, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x3_64Xor_loop
    VZEROUPPER

mulGFNI_1x3_64Xor_end:
    RET
// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 8 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x3Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DX
    MOVQ         start+72(FP), DI

    // Add start offset to output
    ADDQ DI, BX
    ADDQ DI, SI
    ADDQ DI, DX

    // Add start offset to input
    ADDQ DI, CX

mulAvxGFNI_1x3Xor_loop:
    // Load 3 outputs
    VMOVDQU (BX), Y3
    VMOVDQU (SI), Y4
    VMOVDQU (DX), Y5

    // Load and process 32 bytes from input 0 to 3 outputs
    VMOVDQU        (CX), Y6
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y6, Y7
    VXORPD         Y3, Y7, Y3
    VGF2P8AFFINEQB $0x00, Y1, Y6, Y7
    VXORPD         Y4, Y7, Y4
    VGF2P8AFFINEQB $0x00, Y2, Y6, Y7
    VXORPD         Y5, Y7, Y5

    // Store 3 outputs
    VMOVDQU Y3, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y4, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y5, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x3Xor_loop
    VZEROUPPER

mulAvxGFNI_1x3Xor_end:
    RET

// func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 22 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x3_64Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), BX
    MOVQ  start+72(FP), R8

    // Add start offset to output
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, BX

    // Add start offset to input
    ADDQ         R8, DX
    MOVQ         $0x0000000f, R8
    MOVQ         R8, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_1x3_64Xor_loop:
    // Load 3 outputs
    VMOVDQU (SI), Y0
    VMOVDQU 32(SI), Y1
    VMOVDQU (DI), Y2
    VMOVDQU 32(DI), Y3
    VMOVDQU (BX), Y4
    VMOVDQU 32(BX), Y5

    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU (DX), Y11
    VMOVDQU 32(DX), Y13
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU (CX), Y7
    VMOVDQU 32(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 64(CX), Y7
    VMOVDQU 96(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 128(CX), Y7
    VMOVDQU 160(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Store 3 outputs
    VMOVDQU Y0, (SI)
    VMOVDQU Y1, 32(SI)
    ADDQ    $0x40, SI
    VMOVDQU Y2, (DI)
    VMOVDQU Y3, 32(DI)
    ADDQ    $0x40, DI
    VMOVDQU Y4, (BX)
    VMOVDQU Y5, 32(BX)
    ADDQ    $0x40, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x3_64Xor_loop
    VZEROUPPER

mulAvxTwo_1x3_64Xor_end:
    RET

// func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x4(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 17 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x4_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), BX
    MOVQ  start+72(FP), R9

    // Add start offset to output
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, BX

    // Add start offset to input
    ADDQ         R9, DX
    MOVQ         $0x0000000f, R9
    MOVQ         R9, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_1x4_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y6
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y6, Y7
    VPAND   Y4, Y6, Y6
    VPAND   Y4, Y7, Y7
    VMOVDQU (CX), Y3
    VMOVDQU 32(CX), Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y7, Y5, Y5
    VPXOR   Y3, Y5, Y0
    VMOVDQU 64(CX), Y3
    VMOVDQU 96(CX), Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y7, Y5, Y5
    VPXOR   Y3, Y5, Y1
    VMOVDQU 128(CX), Y3
    VMOVDQU 160(CX), Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y7, Y5, Y5
    VPXOR   Y3, Y5, Y2
    VMOVDQU 192(CX), Y3
    VMOVDQU 224(CX), Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y7, Y5, Y5
    VPXOR   Y3, Y5, Y3

    // Store 4 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x4_loop
    VZEROUPPER

mulAvxTwo_1x4_end:
    RET
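// XOR3WAY (defined next to _dummy_ at the top of the file) is the three-input
// XOR used by the mulAvxTwo_*Xor kernels: with GOAMD64_v4 it is a single
// VPTERNLOGD $0x96, a, b, dst (0x96 is the truth table of a ^ b ^ dst);
// otherwise it expands to two VPXORs. The first macro argument is ignored in
// both expansions.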
// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x4_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 10 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x4_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), DX
    MOVQ            start+72(FP), R8

    // Add start offset to output
    ADDQ R8, BX
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, DX

    // Add start offset to input
    ADDQ R8, CX

mulGFNI_1x4_64_loop:
    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64      (CX), Z7
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z7, Z4
    VGF2P8AFFINEQB $0x00, Z1, Z7, Z5
    VGF2P8AFFINEQB $0x00, Z2, Z7, Z6
    VGF2P8AFFINEQB $0x00, Z3, Z7, Z7

    // Store 4 outputs
    VMOVDQU64 Z4, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z5, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z6, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z7, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x4_64_loop
    VZEROUPPER

mulGFNI_1x4_64_end:
    RET

// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x4(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 10 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x4_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), DX
    MOVQ         start+72(FP), R8

    // Add start offset to output
    ADDQ R8, BX
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, DX

    // Add start offset to input
    ADDQ R8, CX

mulAvxGFNI_1x4_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU        (CX), Y7
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y7, Y4
    VGF2P8AFFINEQB $0x00, Y1, Y7, Y5
    VGF2P8AFFINEQB $0x00, Y2, Y7, Y6
    VGF2P8AFFINEQB $0x00, Y3, Y7, Y7

    // Store 4 outputs
    VMOVDQU Y4, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y5, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y6, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y7, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x4_loop
    VZEROUPPER

mulAvxGFNI_1x4_end:
    RET

// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x4_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 10 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x4_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), DX
    MOVQ            start+72(FP), R8

    // Add start offset to output
    ADDQ R8, BX
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, DX

    // Add start offset to input
    ADDQ R8, CX

mulGFNI_1x4_64Xor_loop:
    // Load 4 outputs
    VMOVDQU64 (BX), Z4
    VMOVDQU64 (SI), Z5
    VMOVDQU64 (DI), Z6
    VMOVDQU64 (DX), Z7

    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64      (CX), Z8
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z8, Z9
    VXORPD         Z4, Z9, Z4
    VGF2P8AFFINEQB $0x00, Z1, Z8, Z9
    VXORPD         Z5, Z9, Z5
    VGF2P8AFFINEQB $0x00, Z2, Z8, Z9
    VXORPD         Z6, Z9, Z6
    VGF2P8AFFINEQB $0x00, Z3, Z8, Z9
    VXORPD         Z7, Z9, Z7

    // Store 4 outputs
    VMOVDQU64 Z4, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z5, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z6, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z7, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x4_64Xor_loop
    VZEROUPPER

mulGFNI_1x4_64Xor_end:
    RET

// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 10 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x4Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), DX
    MOVQ         start+72(FP), R8

    // Add start offset to output
    ADDQ R8, BX
    ADDQ R8, SI
    ADDQ R8, DI
    ADDQ R8, DX

    // Add start offset to input
    ADDQ R8, CX

mulAvxGFNI_1x4Xor_loop:
    // Load 4 outputs
    VMOVDQU (BX), Y4
    VMOVDQU (SI), Y5
    VMOVDQU (DI), Y6
    VMOVDQU (DX), Y7

    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU        (CX), Y8
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y8, Y9
    VXORPD         Y4, Y9, Y4
    VGF2P8AFFINEQB $0x00, Y1, Y8, Y9
    VXORPD         Y5, Y9, Y5
    VGF2P8AFFINEQB $0x00, Y2, Y8, Y9
    VXORPD         Y6, Y9, Y6
    VGF2P8AFFINEQB $0x00, Y3, Y8, Y9
    VXORPD         Y7, Y9, Y7

    // Store 4 outputs
    VMOVDQU Y4, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y5, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y6, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y7, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x4Xor_loop
    VZEROUPPER

mulAvxGFNI_1x4Xor_end:
    RET

// func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 17 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x4Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), BX
    MOVQ  start+72(FP), R9

    // Add start offset to output
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, BX

    // Add start offset to input
    ADDQ         R9, DX
    MOVQ         $0x0000000f, R9
    MOVQ         R9, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_1x4Xor_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y7
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU (SI), Y0
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU (DI), Y1
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU (R8), Y2
    VMOVDQU 128(CX), Y5
    VMOVDQU 160(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU (BX), Y3
    VMOVDQU 192(CX), Y5
    VMOVDQU 224(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Store 4 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x4Xor_loop
    VZEROUPPER

mulAvxTwo_1x4Xor_end:
    RET
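// "Loading no tables to registers" in the wider mulAvxTwo kernels means the
// 32-byte lookup tables are re-read from the matrix pointer (CX) on every
// iteration instead of being pinned in YMM registers: with one input and many
// outputs there are not enough YMM registers left to hold every table pair.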
// func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 20 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x5_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), R9
    MOVQ  96(BX), BX
    MOVQ  start+72(FP), R10

    // Add start offset to output
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, BX

    // Add start offset to input
    ADDQ         R10, DX
    MOVQ         $0x0000000f, R10
    MOVQ         R10, X5
    VPBROADCASTB X5, Y5

mulAvxTwo_1x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (DX), Y7
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y5, Y7, Y7
    VPAND   Y5, Y8, Y8
    VMOVDQU (CX), Y4
    VMOVDQU 32(CX), Y6
    VPSHUFB Y7, Y4, Y4
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y4, Y6, Y0
    VMOVDQU 64(CX), Y4
    VMOVDQU 96(CX), Y6
    VPSHUFB Y7, Y4, Y4
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y4, Y6, Y1
    VMOVDQU 128(CX), Y4
    VMOVDQU 160(CX), Y6
    VPSHUFB Y7, Y4, Y4
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y4, Y6, Y2
    VMOVDQU 192(CX), Y4
    VMOVDQU 224(CX), Y6
    VPSHUFB Y7, Y4, Y4
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y4, Y6, Y3
    VMOVDQU 256(CX), Y4
    VMOVDQU 288(CX), Y6
    VPSHUFB Y7, Y4, Y4
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y4, Y6, Y4

    // Store 5 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y4, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x5_loop
    VZEROUPPER

mulAvxTwo_1x5_end:
    RET

// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x5_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 12 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x5_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), R8
    MOVQ            96(DX), DX
    MOVQ            start+72(FP), R9

    // Add start offset to output
    ADDQ R9, BX
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, DX

    // Add start offset to input
    ADDQ R9, CX

mulGFNI_1x5_64_loop:
    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64      (CX), Z9
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z9, Z5
    VGF2P8AFFINEQB $0x00, Z1, Z9, Z6
    VGF2P8AFFINEQB $0x00, Z2, Z9, Z7
    VGF2P8AFFINEQB $0x00, Z3, Z9, Z8
    VGF2P8AFFINEQB $0x00, Z4, Z9, Z9

    // Store 5 outputs
    VMOVDQU64 Z5, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z6, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z7, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z8, (R8)
    ADDQ      $0x40, R8
    VMOVDQU64 Z9, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x5_64_loop
    VZEROUPPER

mulGFNI_1x5_64_end:
    RET

// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x5(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 12 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x5_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         start+72(FP), R9

    // Add start offset to output
    ADDQ R9, BX
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, DX

    // Add start offset to input
    ADDQ R9, CX

mulAvxGFNI_1x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU        (CX), Y9
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y9, Y5
    VGF2P8AFFINEQB $0x00, Y1, Y9, Y6
    VGF2P8AFFINEQB $0x00, Y2, Y9, Y7
    VGF2P8AFFINEQB $0x00, Y3, Y9, Y8
    VGF2P8AFFINEQB $0x00, Y4, Y9, Y9

    // Store 5 outputs
    VMOVDQU Y5, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y6, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y7, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y8, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y9, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x5_loop
    VZEROUPPER

mulAvxGFNI_1x5_end:
    RET

// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x5_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 12 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x5_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), R8
    MOVQ            96(DX), DX
    MOVQ            start+72(FP), R9

    // Add start offset to output
    ADDQ R9, BX
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, DX

    // Add start offset to input
    ADDQ R9, CX

mulGFNI_1x5_64Xor_loop:
    // Load 5 outputs
    VMOVDQU64 (BX), Z5
    VMOVDQU64 (SI), Z6
    VMOVDQU64 (DI), Z7
    VMOVDQU64 (R8), Z8
    VMOVDQU64 (DX), Z9

    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64      (CX), Z10
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z10, Z11
    VXORPD         Z5, Z11, Z5
    VGF2P8AFFINEQB $0x00, Z1, Z10, Z11
    VXORPD         Z6, Z11, Z6
    VGF2P8AFFINEQB $0x00, Z2, Z10, Z11
    VXORPD         Z7, Z11, Z7
    VGF2P8AFFINEQB $0x00, Z3, Z10, Z11
    VXORPD         Z8, Z11, Z8
    VGF2P8AFFINEQB $0x00, Z4, Z10, Z11
    VXORPD         Z9, Z11, Z9

    // Store 5 outputs
    VMOVDQU64 Z5, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z6, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z7, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z8, (R8)
    ADDQ      $0x40, R8
    VMOVDQU64 Z9, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x5_64Xor_loop
    VZEROUPPER

mulGFNI_1x5_64Xor_end:
    RET
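// The accumulation XOR in the GFNI Xor kernels is issued as VXORPD. XOR is a
// pure bitwise operation with no lane interpretation, so the packed-double
// form is interchangeable with an integer XOR here; the choice appears to be
// a generator detail rather than a functional one.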
// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 12 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x5Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         start+72(FP), R9

    // Add start offset to output
    ADDQ R9, BX
    ADDQ R9, SI
    ADDQ R9, DI
    ADDQ R9, R8
    ADDQ R9, DX

    // Add start offset to input
    ADDQ R9, CX

mulAvxGFNI_1x5Xor_loop:
    // Load 5 outputs
    VMOVDQU (BX), Y5
    VMOVDQU (SI), Y6
    VMOVDQU (DI), Y7
    VMOVDQU (R8), Y8
    VMOVDQU (DX), Y9

    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU        (CX), Y10
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y10, Y11
    VXORPD         Y5, Y11, Y5
    VGF2P8AFFINEQB $0x00, Y1, Y10, Y11
    VXORPD         Y6, Y11, Y6
    VGF2P8AFFINEQB $0x00, Y2, Y10, Y11
    VXORPD         Y7, Y11, Y7
    VGF2P8AFFINEQB $0x00, Y3, Y10, Y11
    VXORPD         Y8, Y11, Y8
    VGF2P8AFFINEQB $0x00, Y4, Y10, Y11
    VXORPD         Y9, Y11, Y9

    // Store 5 outputs
    VMOVDQU Y5, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y6, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y7, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y8, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y9, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x5Xor_loop
    VZEROUPPER

mulAvxGFNI_1x5Xor_end:
    RET

// func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 20 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x5Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), R9
    MOVQ  96(BX), BX
    MOVQ  start+72(FP), R10

    // Add start offset to output
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, BX

    // Add start offset to input
    ADDQ         R10, DX
    MOVQ         $0x0000000f, R10
    MOVQ         R10, X5
    VPBROADCASTB X5, Y5

mulAvxTwo_1x5Xor_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (DX), Y8
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y5, Y8, Y8
    VPAND   Y5, Y9, Y9
    VMOVDQU (SI), Y0
    VMOVDQU (CX), Y6
    VMOVDQU 32(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU (DI), Y1
    VMOVDQU 64(CX), Y6
    VMOVDQU 96(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU (R8), Y2
    VMOVDQU 128(CX), Y6
    VMOVDQU 160(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU (R9), Y3
    VMOVDQU 192(CX), Y6
    VMOVDQU 224(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU (BX), Y4
    VMOVDQU 256(CX), Y6
    VMOVDQU 288(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Store 5 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y4, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x5Xor_loop
    VZEROUPPER

mulAvxTwo_1x5Xor_end:
    RET
// func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x6(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 23 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x6_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), R9
    MOVQ  96(BX), R10
    MOVQ  120(BX), BX
    MOVQ  start+72(FP), R11

    // Add start offset to output
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, BX

    // Add start offset to input
    ADDQ         R11, DX
    MOVQ         $0x0000000f, R11
    MOVQ         R11, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_1x6_loop:
    // Load and process 32 bytes from input 0 to 6 outputs
    VMOVDQU (DX), Y8
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y6, Y8, Y8
    VPAND   Y6, Y9, Y9
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y7
    VPSHUFB Y8, Y5, Y5
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y5, Y7, Y0
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y7
    VPSHUFB Y8, Y5, Y5
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y5, Y7, Y1
    VMOVDQU 128(CX), Y5
    VMOVDQU 160(CX), Y7
    VPSHUFB Y8, Y5, Y5
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y5, Y7, Y2
    VMOVDQU 192(CX), Y5
    VMOVDQU 224(CX), Y7
    VPSHUFB Y8, Y5, Y5
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y5, Y7, Y3
    VMOVDQU 256(CX), Y5
    VMOVDQU 288(CX), Y7
    VPSHUFB Y8, Y5, Y5
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y5, Y7, Y4
    VMOVDQU 320(CX), Y5
    VMOVDQU 352(CX), Y7
    VPSHUFB Y8, Y5, Y5
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y5, Y7, Y5

    // Store 6 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y4, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y5, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x6_loop
    VZEROUPPER

mulAvxTwo_1x6_end:
    RET

// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x6_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 14 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x6_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), R8
    MOVQ            96(DX), R9
    MOVQ            120(DX), DX
    MOVQ            start+72(FP), R10

    // Add start offset to output
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, DX

    // Add start offset to input
    ADDQ R10, CX

mulGFNI_1x6_64_loop:
    // Load and process 64 bytes from input 0 to 6 outputs
    VMOVDQU64      (CX), Z11
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z11, Z6
    VGF2P8AFFINEQB $0x00, Z1, Z11, Z7
    VGF2P8AFFINEQB $0x00, Z2, Z11, Z8
    VGF2P8AFFINEQB $0x00, Z3, Z11, Z9
    VGF2P8AFFINEQB $0x00, Z4, Z11, Z10
    VGF2P8AFFINEQB $0x00, Z5, Z11, Z11

    // Store 6 outputs
    VMOVDQU64 Z6, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z7, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z8, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z9, (R8)
    ADDQ      $0x40, R8
    VMOVDQU64 Z10, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z11, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x6_64_loop
    VZEROUPPER

mulGFNI_1x6_64_end:
    RET

// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x6(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 14 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x6_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), R9
    MOVQ         120(DX), DX
    MOVQ         start+72(FP), R10

    // Add start offset to output
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, DX

    // Add start offset to input
    ADDQ R10, CX

mulAvxGFNI_1x6_loop:
    // Load and process 32 bytes from input 0 to 6 outputs
    VMOVDQU        (CX), Y11
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y11, Y6
    VGF2P8AFFINEQB $0x00, Y1, Y11, Y7
    VGF2P8AFFINEQB $0x00, Y2, Y11, Y8
    VGF2P8AFFINEQB $0x00, Y3, Y11, Y9
    VGF2P8AFFINEQB $0x00, Y4, Y11, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y11, Y11

    // Store 6 outputs
    VMOVDQU Y6, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y7, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y8, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y9, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y10, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y11, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x6_loop
    VZEROUPPER

mulAvxGFNI_1x6_end:
    RET
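// The "Full registers estimated N YMM used" notes are the generator's
// register budget: roughly one register per broadcast matrix plus one result
// register per output and the input/scratch vector for a GFNI kernel, which
// is why the estimate grows with the output count.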
// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x6_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 14 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x6_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), R8
    MOVQ            96(DX), R9
    MOVQ            120(DX), DX
    MOVQ            start+72(FP), R10

    // Add start offset to output
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, DX

    // Add start offset to input
    ADDQ R10, CX

mulGFNI_1x6_64Xor_loop:
    // Load 6 outputs
    VMOVDQU64 (BX), Z6
    VMOVDQU64 (SI), Z7
    VMOVDQU64 (DI), Z8
    VMOVDQU64 (R8), Z9
    VMOVDQU64 (R9), Z10
    VMOVDQU64 (DX), Z11

    // Load and process 64 bytes from input 0 to 6 outputs
    VMOVDQU64      (CX), Z12
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
    VXORPD         Z6, Z13, Z6
    VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
    VXORPD         Z7, Z13, Z7
    VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
    VXORPD         Z8, Z13, Z8
    VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
    VXORPD         Z9, Z13, Z9
    VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
    VXORPD         Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
    VXORPD         Z11, Z13, Z11

    // Store 6 outputs
    VMOVDQU64 Z6, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z7, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z8, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z9, (R8)
    ADDQ      $0x40, R8
    VMOVDQU64 Z10, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z11, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x6_64Xor_loop
    VZEROUPPER

mulGFNI_1x6_64Xor_end:
    RET

// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 14 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x6Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), R9
    MOVQ         120(DX), DX
    MOVQ         start+72(FP), R10

    // Add start offset to output
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, R9
    ADDQ R10, DX

    // Add start offset to input
    ADDQ R10, CX

mulAvxGFNI_1x6Xor_loop:
    // Load 6 outputs
    VMOVDQU (BX), Y6
    VMOVDQU (SI), Y7
    VMOVDQU (DI), Y8
    VMOVDQU (R8), Y9
    VMOVDQU (R9), Y10
    VMOVDQU (DX), Y11

    // Load and process 32 bytes from input 0 to 6 outputs
    VMOVDQU        (CX), Y12
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
    VXORPD         Y6, Y13, Y6
    VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
    VXORPD         Y7, Y13, Y7
    VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
    VXORPD         Y8, Y13, Y8
    VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
    VXORPD         Y9, Y13, Y9
    VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
    VXORPD         Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
    VXORPD         Y11, Y13, Y11

    // Store 6 outputs
    VMOVDQU Y6, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y7, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y8, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y9, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y10, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y11, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x6Xor_loop
    VZEROUPPER

mulAvxGFNI_1x6Xor_end:
    RET

// func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 23 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x6Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), R9
    MOVQ  96(BX), R10
    MOVQ  120(BX), BX
    MOVQ  start+72(FP), R11

    // Add start offset to output
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, BX

    // Add start offset to input
    ADDQ         R11, DX
    MOVQ         $0x0000000f, R11
    MOVQ         R11, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_1x6Xor_loop:
    // Load and process 32 bytes from input 0 to 6 outputs
    VMOVDQU (DX), Y9
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y9, Y10
    VPAND   Y6, Y9, Y9
    VPAND   Y6, Y10, Y10
    VMOVDQU (SI), Y0
    VMOVDQU (CX), Y7
    VMOVDQU 32(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    VMOVDQU (DI), Y1
    VMOVDQU 64(CX), Y7
    VMOVDQU 96(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU (R8), Y2
    VMOVDQU 128(CX), Y7
    VMOVDQU 160(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    VMOVDQU (R9), Y3
    VMOVDQU 192(CX), Y7
    VMOVDQU 224(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y3)
    VMOVDQU (R10), Y4
    VMOVDQU 256(CX), Y7
    VMOVDQU 288(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    VMOVDQU (BX), Y5
    VMOVDQU 320(CX), Y7
    VMOVDQU 352(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y5)

    // Store 6 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y4, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y5, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x6Xor_loop
    VZEROUPPER

mulAvxTwo_1x6Xor_end:
    RET

// func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_1x7(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 26 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_1x7_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), DX
    MOVQ  out_base+48(FP), BX
    MOVQ  (BX), SI
    MOVQ  24(BX), DI
    MOVQ  48(BX), R8
    MOVQ  72(BX), R9
    MOVQ  96(BX), R10
    MOVQ  120(BX), R11
    MOVQ  144(BX), BX
    MOVQ  start+72(FP), R12

    // Add start offset to output
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, R8
    ADDQ R12, R9
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, BX

    // Add start offset to input
    ADDQ         R12, DX
    MOVQ         $0x0000000f, R12
    MOVQ         R12, X7
    VPBROADCASTB X7, Y7

mulAvxTwo_1x7_loop:
    // Load and process 32 bytes from input 0 to 7 outputs
    VMOVDQU (DX), Y9
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y9, Y10
    VPAND   Y7, Y9, Y9
    VPAND   Y7, Y10, Y10
    VMOVDQU (CX), Y6
    VMOVDQU 32(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y0
    VMOVDQU 64(CX), Y6
    VMOVDQU 96(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y1
    VMOVDQU 128(CX), Y6
    VMOVDQU 160(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y2
    VMOVDQU 192(CX), Y6
    VMOVDQU 224(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y3
    VMOVDQU 256(CX), Y6
    VMOVDQU 288(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y4
    VMOVDQU 320(CX), Y6
    VMOVDQU 352(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y5
    VMOVDQU 384(CX), Y6
    VMOVDQU 416(CX), Y8
    VPSHUFB Y9, Y6, Y6
    VPSHUFB Y10, Y8, Y8
    VPXOR   Y6, Y8, Y6

    // Store 7 outputs
    VMOVDQU Y0, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y1, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y2, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y4, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y5, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y6, (BX)
    ADDQ    $0x20, BX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_1x7_loop
    VZEROUPPER

mulAvxTwo_1x7_end:
    RET
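// start is a byte offset added to every input and output pointer before the
// main loop, so a caller can restrict a kernel to the [start, start+n) window
// of each shard, e.g. to split one encode across several goroutines.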
// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x7_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 16 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x7_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), R8
    MOVQ            96(DX), R9
    MOVQ            120(DX), R10
    MOVQ            144(DX), DX
    MOVQ            start+72(FP), R11

    // Add start offset to output
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, DX

    // Add start offset to input
    ADDQ R11, CX

mulGFNI_1x7_64_loop:
    // Load and process 64 bytes from input 0 to 7 outputs
    VMOVDQU64      (CX), Z13
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z13, Z7
    VGF2P8AFFINEQB $0x00, Z1, Z13, Z8
    VGF2P8AFFINEQB $0x00, Z2, Z13, Z9
    VGF2P8AFFINEQB $0x00, Z3, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z4, Z13, Z11
    VGF2P8AFFINEQB $0x00, Z5, Z13, Z12
    VGF2P8AFFINEQB $0x00, Z6, Z13, Z13

    // Store 7 outputs
    VMOVDQU64 Z7, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z8, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z9, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z10, (R8)
    ADDQ      $0x40, R8
    VMOVDQU64 Z11, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z12, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z13, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x7_64_loop
    VZEROUPPER

mulGFNI_1x7_64_end:
    RET

// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x7(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 16 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x7_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), R9
    MOVQ         120(DX), R10
    MOVQ         144(DX), DX
    MOVQ         start+72(FP), R11

    // Add start offset to output
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, DX

    // Add start offset to input
    ADDQ R11, CX

mulAvxGFNI_1x7_loop:
    // Load and process 32 bytes from input 0 to 7 outputs
    VMOVDQU        (CX), Y13
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y13, Y7
    VGF2P8AFFINEQB $0x00, Y1, Y13, Y8
    VGF2P8AFFINEQB $0x00, Y2, Y13, Y9
    VGF2P8AFFINEQB $0x00, Y3, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y4, Y13, Y11
    VGF2P8AFFINEQB $0x00, Y5, Y13, Y12
    VGF2P8AFFINEQB $0x00, Y6, Y13, Y13

    // Store 7 outputs
    VMOVDQU Y7, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y8, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y9, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y10, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y11, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y12, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y13, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x7_loop
    VZEROUPPER

mulAvxGFNI_1x7_end:
    RET

// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_1x7_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 16 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_1x7_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), CX
    MOVQ            out_base+48(FP), DX
    MOVQ            out_base+48(FP), DX
    MOVQ            (DX), BX
    MOVQ            24(DX), SI
    MOVQ            48(DX), DI
    MOVQ            72(DX), R8
    MOVQ            96(DX), R9
    MOVQ            120(DX), R10
    MOVQ            144(DX), DX
    MOVQ            start+72(FP), R11

    // Add start offset to output
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, DX

    // Add start offset to input
    ADDQ R11, CX

mulGFNI_1x7_64Xor_loop:
    // Load 7 outputs
    VMOVDQU64 (BX), Z7
    VMOVDQU64 (SI), Z8
    VMOVDQU64 (DI), Z9
    VMOVDQU64 (R8), Z10
    VMOVDQU64 (R9), Z11
    VMOVDQU64 (R10), Z12
    VMOVDQU64 (DX), Z13

    // Load and process 64 bytes from input 0 to 7 outputs
    VMOVDQU64      (CX), Z14
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z0, Z14, Z15
    VXORPD         Z7, Z15, Z7
    VGF2P8AFFINEQB $0x00, Z1, Z14, Z15
    VXORPD         Z8, Z15, Z8
    VGF2P8AFFINEQB $0x00, Z2, Z14, Z15
    VXORPD         Z9, Z15, Z9
    VGF2P8AFFINEQB $0x00, Z3, Z14, Z15
    VXORPD         Z10, Z15, Z10
    VGF2P8AFFINEQB $0x00, Z4, Z14, Z15
    VXORPD         Z11, Z15, Z11
    VGF2P8AFFINEQB $0x00, Z5, Z14, Z15
    VXORPD         Z12, Z15, Z12
    VGF2P8AFFINEQB $0x00, Z6, Z14, Z15
    VXORPD         Z13, Z15, Z13

    // Store 7 outputs
    VMOVDQU64 Z7, (BX)
    ADDQ      $0x40, BX
    VMOVDQU64 Z8, (SI)
    ADDQ      $0x40, SI
    VMOVDQU64 Z9, (DI)
    ADDQ      $0x40, DI
    VMOVDQU64 Z10, (R8)
    ADDQ      $0x40, R8
    VMOVDQU64 Z11, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z12, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z13, (DX)
    ADDQ      $0x40, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_1x7_64Xor_loop
    VZEROUPPER

mulGFNI_1x7_64Xor_end:
    RET

// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 16 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_1x7Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), CX
    MOVQ         out_base+48(FP), DX
    MOVQ         out_base+48(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), R9
    MOVQ         120(DX), R10
    MOVQ         144(DX), DX
    MOVQ         start+72(FP), R11

    // Add start offset to output
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, DX

    // Add start offset to input
    ADDQ R11, CX

mulAvxGFNI_1x7Xor_loop:
    // Load 7 outputs
    VMOVDQU (BX), Y7
    VMOVDQU (SI), Y8
    VMOVDQU (DI), Y9
    VMOVDQU (R8), Y10
    VMOVDQU (R9), Y11
    VMOVDQU (R10), Y12
    VMOVDQU (DX), Y13

    // Load and process 32 bytes from input 0 to 7 outputs
    VMOVDQU        (CX), Y14
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
    VXORPD         Y7, Y15, Y7
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
    VXORPD         Y8, Y15, Y8
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
    VXORPD         Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Store 7 outputs
    VMOVDQU Y7, (BX)
    ADDQ    $0x20, BX
    VMOVDQU Y8, (SI)
    ADDQ    $0x20, SI
    VMOVDQU Y9, (DI)
    ADDQ    $0x20, DI
    VMOVDQU Y10, (R8)
    ADDQ    $0x20, R8
    VMOVDQU Y11, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y12, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y13, (DX)
    ADDQ    $0x20, DX

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_1x7Xor_loop
    VZEROUPPER

mulAvxGFNI_1x7Xor_end:
    RET
·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), BX MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, BX // Add start offset to input ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X7 VPBROADCASTB X7, Y7 mulAvxTwo_1x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (SI), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (BX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x7Xor_loop VZEROUPPER mulAvxTwo_1x7Xor_end: RET // func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x8_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 mulAvxTwo_1x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y8, Y10, Y10 VPAND Y8, Y11, Y11 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y9 VPSHUFB 
Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y5 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y6 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y7, Y7 VPSHUFB Y11, Y9, Y9 VPXOR Y7, Y9, Y7 // Store 8 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x8_loop VZEROUPPER mulAvxTwo_1x8_end: RET // func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_1x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DX // Add start offset to input ADDQ R12, CX mulGFNI_1x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (CX), Z15 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z0, Z15, Z8 VGF2P8AFFINEQB $0x00, Z1, Z15, Z9 VGF2P8AFFINEQB $0x00, Z2, Z15, Z10 VGF2P8AFFINEQB $0x00, Z3, Z15, Z11 VGF2P8AFFINEQB $0x00, Z4, Z15, Z12 VGF2P8AFFINEQB $0x00, Z5, Z15, Z13 VGF2P8AFFINEQB $0x00, Z6, Z15, Z14 VGF2P8AFFINEQB $0x00, Z7, Z15, Z15 // Store 8 outputs VMOVDQU64 Z8, (BX) ADDQ $0x40, BX VMOVDQU64 Z9, (SI) ADDQ $0x40, SI VMOVDQU64 Z10, (DI) ADDQ $0x40, DI VMOVDQU64 Z11, (R8) ADDQ $0x40, R8 VMOVDQU64 Z12, (R9) ADDQ $0x40, R9 VMOVDQU64 Z13, (R10) ADDQ $0x40, R10 VMOVDQU64 Z14, (R11) ADDQ $0x40, R11 VMOVDQU64 Z15, (DX) ADDQ $0x40, DX // Prepare for next loop DECQ AX JNZ mulGFNI_1x8_64_loop VZEROUPPER mulGFNI_1x8_64_end: RET // func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_1x8(SB), $0-88 // Loading 6 of 8 tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_1x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX mulAvxGFNI_1x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 
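// Note on the affine sequence in this loop: each VGF2P8AFFINEQB multiplies the 32
// source bytes held in Y13 by an 8x8 GF(2) bit-matrix broadcast from the coding
// matrix, which amounts to multiplication by a constant GF(2^8) scalar; the $0x00
// immediate means no constant vector is XORed into the result.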
VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 VBROADCASTSD 56(CX), Y14 VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 // Store 8 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI VMOVDQU Y7, (DI) ADDQ $0x20, DI VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_1x8_loop VZEROUPPER mulAvxGFNI_1x8_end: RET // func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_1x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DX // Add start offset to input ADDQ R12, CX mulGFNI_1x8_64Xor_loop: // Load 8 outputs VMOVDQU64 (BX), Z8 VMOVDQU64 (SI), Z9 VMOVDQU64 (DI), Z10 VMOVDQU64 (R8), Z11 VMOVDQU64 (R9), Z12 VMOVDQU64 (R10), Z13 VMOVDQU64 (R11), Z14 VMOVDQU64 (DX), Z15 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (CX), Z16 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 VXORPD Z8, Z17, Z8 VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 VXORPD Z9, Z17, Z9 VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 VXORPD Z10, Z17, Z10 VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 VXORPD Z11, Z17, Z11 VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 VXORPD Z12, Z17, Z12 VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 VXORPD Z13, Z17, Z13 VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 VXORPD Z14, Z17, Z14 VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 VXORPD Z15, Z17, Z15 // Store 8 outputs VMOVDQU64 Z8, (BX) ADDQ $0x40, BX VMOVDQU64 Z9, (SI) ADDQ $0x40, SI VMOVDQU64 Z10, (DI) ADDQ $0x40, DI VMOVDQU64 Z11, (R8) ADDQ $0x40, R8 VMOVDQU64 Z12, (R9) ADDQ $0x40, R9 VMOVDQU64 Z13, (R10) ADDQ $0x40, R10 VMOVDQU64 Z14, (R11) ADDQ $0x40, R11 VMOVDQU64 Z15, (DX) ADDQ $0x40, DX // Prepare for next loop DECQ AX JNZ mulGFNI_1x8_64Xor_loop VZEROUPPER mulGFNI_1x8_64Xor_end: RET // func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 // Loading 6 of 8 tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_1x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ 
R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX mulAvxGFNI_1x8Xor_loop: // Load 8 outputs VMOVDQU (SI), Y6 VMOVDQU (DI), Y7 VMOVDQU (R8), Y8 VMOVDQU (R9), Y9 VMOVDQU (R10), Y10 VMOVDQU (R11), Y11 VMOVDQU (R12), Y12 VMOVDQU (BX), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI VMOVDQU Y7, (DI) ADDQ $0x20, DI VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_1x8Xor_loop VZEROUPPER mulAvxGFNI_1x8Xor_end: RET // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 mulAvxTwo_1x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (SI), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (BX), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (BX) ADDQ $0x20, BX // Prepare 
for next loop DECQ AX JNZ mulAvxTwo_1x8Xor_loop VZEROUPPER mulAvxTwo_1x8Xor_end: RET // func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x9_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 mulAvxTwo_1x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y9, Y11, Y11 VPAND Y9, Y12, Y12 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y6 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y7 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 VPXOR Y8, Y10, Y8 // Store 9 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (R13) ADDQ $0x20, R13 VMOVDQU Y8, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x9_loop VZEROUPPER mulAvxTwo_1x9_end: RET // func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_1x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DX // Add start offset to input ADDQ R13, CX mulGFNI_1x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (CX), Z17 
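// Z17 holds 64 source bytes, loaded once and fanned out to all 9 outputs below;
// the final affine op (matrix row Z8) writes its result over Z17 itself, so the
// whole 9-output step fits without a spare accumulator register.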
ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z0, Z17, Z9 VGF2P8AFFINEQB $0x00, Z1, Z17, Z10 VGF2P8AFFINEQB $0x00, Z2, Z17, Z11 VGF2P8AFFINEQB $0x00, Z3, Z17, Z12 VGF2P8AFFINEQB $0x00, Z4, Z17, Z13 VGF2P8AFFINEQB $0x00, Z5, Z17, Z14 VGF2P8AFFINEQB $0x00, Z6, Z17, Z15 VGF2P8AFFINEQB $0x00, Z7, Z17, Z16 VGF2P8AFFINEQB $0x00, Z8, Z17, Z17 // Store 9 outputs VMOVDQU64 Z9, (BX) ADDQ $0x40, BX VMOVDQU64 Z10, (SI) ADDQ $0x40, SI VMOVDQU64 Z11, (DI) ADDQ $0x40, DI VMOVDQU64 Z12, (R8) ADDQ $0x40, R8 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 VMOVDQU64 Z14, (R10) ADDQ $0x40, R10 VMOVDQU64 Z15, (R11) ADDQ $0x40, R11 VMOVDQU64 Z16, (R12) ADDQ $0x40, R12 VMOVDQU64 Z17, (DX) ADDQ $0x40, DX // Prepare for next loop DECQ AX JNZ mulGFNI_1x9_64_loop VZEROUPPER mulGFNI_1x9_64_end: RET // func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_1x9(SB), $0-88 // Loading 5 of 9 tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_1x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX mulAvxGFNI_1x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 VBROADCASTSD 64(CX), Y14 VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 // Store 9 outputs VMOVDQU Y5, (SI) ADDQ $0x20, SI VMOVDQU Y6, (DI) ADDQ $0x20, DI VMOVDQU Y7, (R8) ADDQ $0x20, R8 VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_1x9_loop VZEROUPPER mulAvxGFNI_1x9_end: RET // func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_1x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, BX ADDQ R13, SI ADDQ R13, 
DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DX // Add start offset to input ADDQ R13, CX mulGFNI_1x9_64Xor_loop: // Load 9 outputs VMOVDQU64 (BX), Z9 VMOVDQU64 (SI), Z10 VMOVDQU64 (DI), Z11 VMOVDQU64 (R8), Z12 VMOVDQU64 (R9), Z13 VMOVDQU64 (R10), Z14 VMOVDQU64 (R11), Z15 VMOVDQU64 (R12), Z16 VMOVDQU64 (DX), Z17 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (CX), Z18 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 VXORPD Z9, Z19, Z9 VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 VXORPD Z10, Z19, Z10 VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 VXORPD Z11, Z19, Z11 VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 VXORPD Z12, Z19, Z12 VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 VXORPD Z13, Z19, Z13 VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 VXORPD Z14, Z19, Z14 VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 VXORPD Z15, Z19, Z15 VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 VXORPD Z17, Z19, Z17 // Store 9 outputs VMOVDQU64 Z9, (BX) ADDQ $0x40, BX VMOVDQU64 Z10, (SI) ADDQ $0x40, SI VMOVDQU64 Z11, (DI) ADDQ $0x40, DI VMOVDQU64 Z12, (R8) ADDQ $0x40, R8 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 VMOVDQU64 Z14, (R10) ADDQ $0x40, R10 VMOVDQU64 Z15, (R11) ADDQ $0x40, R11 VMOVDQU64 Z16, (R12) ADDQ $0x40, R12 VMOVDQU64 Z17, (DX) ADDQ $0x40, DX // Prepare for next loop DECQ AX JNZ mulGFNI_1x9_64Xor_loop VZEROUPPER mulGFNI_1x9_64Xor_end: RET // func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 // Loading 5 of 9 tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_1x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX mulAvxGFNI_1x9Xor_loop: // Load 9 outputs VMOVDQU (SI), Y5 VMOVDQU (DI), Y6 VMOVDQU (R8), Y7 VMOVDQU (R9), Y8 VMOVDQU (R10), Y9 VMOVDQU (R11), Y10 VMOVDQU (R12), Y11 VMOVDQU (R13), Y12 VMOVDQU (BX), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs VMOVDQU Y5, (SI) ADDQ $0x20, SI VMOVDQU Y6, (DI) ADDQ $0x20, DI VMOVDQU Y7, (R8) ADDQ $0x20, R8 VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ 
mulAvxGFNI_1x9Xor_loop VZEROUPPER mulAvxGFNI_1x9Xor_end: RET // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 mulAvxTwo_1x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (SI), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (BX), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (R13) ADDQ $0x20, R13 VMOVDQU Y8, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x9Xor_loop VZEROUPPER mulAvxTwo_1x9Xor_end: RET // func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x10_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ 
R15, X10 VPBROADCASTB X10, Y10 mulAvxTwo_1x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y10, Y12, Y12 VPAND Y10, Y13, Y13 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y7 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y8 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y9, Y9 VPSHUFB Y13, Y11, Y11 VPXOR Y9, Y11, Y9 // Store 10 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (R13) ADDQ $0x20, R13 VMOVDQU Y8, (R14) ADDQ $0x20, R14 VMOVDQU Y9, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x10_loop VZEROUPPER mulAvxTwo_1x10_end: RET // func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x10_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_1x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DX // Add start offset to input ADDQ R14, CX mulGFNI_1x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (CX), Z19 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 // Store 10 outputs VMOVDQU64 Z10, (BX) ADDQ $0x40, BX VMOVDQU64 Z11, (SI) ADDQ $0x40, SI VMOVDQU64 Z12, (DI) ADDQ $0x40, DI VMOVDQU64 Z13, (R8) ADDQ $0x40, R8 VMOVDQU64 Z14, (R9) ADDQ $0x40, R9 VMOVDQU64 Z15, (R10) ADDQ $0x40, R10 VMOVDQU64 Z16, (R11) ADDQ $0x40, R11 VMOVDQU64 Z17, (R12) ADDQ $0x40, R12 VMOVDQU64 Z18, 
(R13) ADDQ $0x40, R13 VMOVDQU64 Z19, (DX) ADDQ $0x40, DX // Prepare for next loop DECQ AX JNZ mulGFNI_1x10_64_loop VZEROUPPER mulGFNI_1x10_64_end: RET // func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_1x10(SB), $0-88 // Loading 4 of 10 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_1x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX mulAvxGFNI_1x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 VBROADCASTSD 72(CX), Y14 VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 // Store 10 outputs VMOVDQU Y4, (SI) ADDQ $0x20, SI VMOVDQU Y5, (DI) ADDQ $0x20, DI VMOVDQU Y6, (R8) ADDQ $0x20, R8 VMOVDQU Y7, (R9) ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_1x10_loop VZEROUPPER mulAvxGFNI_1x10_end: RET // func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_1x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DX // Add start offset to input ADDQ R14, CX mulGFNI_1x10_64Xor_loop: // Load 10 outputs VMOVDQU64 (BX), Z10 VMOVDQU64 (SI), Z11 VMOVDQU64 (DI), Z12 VMOVDQU64 (R8), Z13 VMOVDQU64 (R9), Z14 VMOVDQU64 (R10), Z15 VMOVDQU64 (R11), Z16 VMOVDQU64 (R12), Z17 VMOVDQU64 (R13), Z18 VMOVDQU64 (DX), Z19 // Load and process 64 bytes from input 
0 to 10 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 VXORPD Z10, Z21, Z10 VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 VXORPD Z11, Z21, Z11 VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 VXORPD Z12, Z21, Z12 VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 VXORPD Z13, Z21, Z13 VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 VXORPD Z14, Z21, Z14 VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z15, Z21, Z15 VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 10 outputs VMOVDQU64 Z10, (BX) ADDQ $0x40, BX VMOVDQU64 Z11, (SI) ADDQ $0x40, SI VMOVDQU64 Z12, (DI) ADDQ $0x40, DI VMOVDQU64 Z13, (R8) ADDQ $0x40, R8 VMOVDQU64 Z14, (R9) ADDQ $0x40, R9 VMOVDQU64 Z15, (R10) ADDQ $0x40, R10 VMOVDQU64 Z16, (R11) ADDQ $0x40, R11 VMOVDQU64 Z17, (R12) ADDQ $0x40, R12 VMOVDQU64 Z18, (R13) ADDQ $0x40, R13 VMOVDQU64 Z19, (DX) ADDQ $0x40, DX // Prepare for next loop DECQ AX JNZ mulGFNI_1x10_64Xor_loop VZEROUPPER mulGFNI_1x10_64Xor_end: RET // func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 // Loading 4 of 10 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_1x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX mulAvxGFNI_1x10Xor_loop: // Load 10 outputs VMOVDQU (SI), Y4 VMOVDQU (DI), Y5 VMOVDQU (R8), Y6 VMOVDQU (R9), Y7 VMOVDQU (R10), Y8 VMOVDQU (R11), Y9 VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (BX), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs VMOVDQU Y4, (SI) ADDQ $0x20, SI VMOVDQU Y5, (DI) ADDQ $0x20, DI VMOVDQU Y6, (R8) ADDQ $0x20, R8 VMOVDQU Y7, (R9) ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_1x10Xor_loop VZEROUPPER mulAvxGFNI_1x10Xor_end: RET // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, 
out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 mulAvxTwo_1x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (SI), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU (R14), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (BX), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (R13) ADDQ $0x20, R13 VMOVDQU Y8, (R14) ADDQ $0x20, R14 VMOVDQU Y9, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x10Xor_loop VZEROUPPER mulAvxTwo_1x10Xor_end: RET // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x1_64_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX MOVQ $0x0000000f, SI MOVQ SI, X6 VPBROADCASTB X6, Y6 
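// Y6 carries 0x0f in every byte and drives the 4-bit table-lookup multiply used by
// the mulAvxTwo kernels: each source byte is split into low and high nibbles, each
// nibble indexes a 16-entry table via VPSHUFB (Y0/Y1 hold the low/high-nibble tables
// for input 0, Y2/Y3 for input 1), and the two lookups are XORed together to form
// the GF(2^8) product, 32 bytes per shuffle.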
mulAvxTwo_2x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y9 ADDQ $0x40, DX VPSRLQ $0x04, Y7, Y8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y7, Y7 VPAND Y6, Y9, Y9 VPAND Y6, Y8, Y8 VPAND Y6, Y10, Y10 VPSHUFB Y7, Y0, Y7 VPSHUFB Y9, Y0, Y9 VPSHUFB Y8, Y1, Y8 VPSHUFB Y10, Y1, Y10 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y9 ADDQ $0x40, CX VPSRLQ $0x04, Y7, Y8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y7, Y7 VPAND Y6, Y9, Y9 VPAND Y6, Y8, Y8 VPAND Y6, Y10, Y10 VPSHUFB Y7, Y2, Y7 VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 1 outputs VMOVDQU Y4, (BX) VMOVDQU Y5, 32(BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x1_64_loop VZEROUPPER mulAvxTwo_2x1_64_end: RET // func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 5 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX mulGFNI_2x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z3 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (CX), Z3 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 VXORPD Z2, Z3, Z2 // Store 1 outputs VMOVDQU64 Z2, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x1_64_loop VZEROUPPER mulGFNI_2x1_64_end: RET // func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 5 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX mulAvxGFNI_2x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y3 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y3 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 VXORPD Y2, Y3, Y2 // Store 1 outputs VMOVDQU Y2, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x1_loop VZEROUPPER mulAvxGFNI_2x1_end: RET // func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 5 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ 
start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX mulGFNI_2x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (BX), Z2 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z3 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z3, Z3 VXORPD Z2, Z3, Z2 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (CX), Z3 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 VXORPD Z2, Z3, Z2 // Store 1 outputs VMOVDQU64 Z2, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x1_64Xor_loop VZEROUPPER mulGFNI_2x1_64Xor_end: RET // func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 5 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX mulAvxGFNI_2x1Xor_loop: // Load 1 outputs VMOVDQU (BX), Y2 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y3 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 VXORPD Y2, Y3, Y2 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y3 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 VXORPD Y2, Y3, Y2 // Store 1 outputs VMOVDQU Y2, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x1Xor_loop VZEROUPPER mulAvxGFNI_2x1Xor_end: RET // func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x1_64Xor_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX MOVQ $0x0000000f, SI MOVQ SI, X6 VPBROADCASTB X6, Y6 mulAvxTwo_2x1_64Xor_loop: // Load 1 outputs VMOVDQU (BX), Y4 VMOVDQU 32(BX), Y5 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y9 ADDQ $0x40, DX VPSRLQ $0x04, Y7, Y8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y7, Y7 VPAND Y6, Y9, Y9 VPAND Y6, Y8, Y8 VPAND Y6, Y10, Y10 VPSHUFB Y7, Y0, Y7 VPSHUFB Y9, Y0, Y9 VPSHUFB Y8, Y1, Y8 VPSHUFB Y10, Y1, Y10 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y9 ADDQ $0x40, CX VPSRLQ $0x04, Y7, Y8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y7, Y7 VPAND Y6, Y9, Y9 VPAND Y6, Y8, Y8 VPAND Y6, Y10, Y10 VPSHUFB Y7, Y2, Y7 VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 1 outputs VMOVDQU Y4, (BX) VMOVDQU Y5, 32(BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x1_64Xor_loop VZEROUPPER mulAvxTwo_2x1_64Xor_end: RET // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, 
AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI ADDQ R8, SI // Add start offset to input ADDQ R8, BX ADDQ R8, DX MOVQ $0x0000000f, R8 MOVQ R8, X4 VPBROADCASTB X4, Y4 mulAvxTwo_2x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) ADDQ $0x40, DI VMOVDQU Y2, (SI) VMOVDQU Y3, 32(SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x2_64_loop VZEROUPPER mulAvxTwo_2x2_64_end: RET // func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input ADDQ DI, DX ADDQ DI, CX mulGFNI_2x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z6 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z6, Z4 VGF2P8AFFINEQB $0x00, Z1, Z6, Z5 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (CX), Z6 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 VXORPD Z4, Z7, Z4 VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 VXORPD Z5, Z7, Z5 // Store 2 outputs VMOVDQU64 Z4, (SI) ADDQ $0x40, SI VMOVDQU64 Z5, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x2_64_loop VZEROUPPER mulGFNI_2x2_64_end: RET // func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x2(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x2_end VBROADCASTSD 
(CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input ADDQ DI, DX ADDQ DI, CX mulAvxGFNI_2x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 VXORPD Y4, Y7, Y4 VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 VXORPD Y5, Y7, Y5 // Store 2 outputs VMOVDQU Y4, (SI) ADDQ $0x20, SI VMOVDQU Y5, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x2_loop VZEROUPPER mulAvxGFNI_2x2_end: RET // func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input ADDQ DI, DX ADDQ DI, CX mulGFNI_2x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (SI), Z4 VMOVDQU64 (BX), Z5 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z6 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 VXORPD Z4, Z7, Z4 VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 VXORPD Z5, Z7, Z5 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (CX), Z6 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 VXORPD Z4, Z7, Z4 VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 VXORPD Z5, Z7, Z5 // Store 2 outputs VMOVDQU64 Z4, (SI) ADDQ $0x40, SI VMOVDQU64 Z5, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x2_64Xor_loop VZEROUPPER mulGFNI_2x2_64Xor_end: RET // func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input ADDQ DI, DX ADDQ DI, CX mulAvxGFNI_2x2Xor_loop: // Load 2 outputs VMOVDQU (SI), Y4 VMOVDQU (BX), Y5 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 VXORPD Y4, Y7, Y4 VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 VXORPD Y5, Y7, Y5 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 VXORPD Y4, Y7, Y4 VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 VXORPD Y5, Y7, Y5 // Store 2 outputs VMOVDQU Y4, (SI) ADDQ $0x20, SI VMOVDQU Y5, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ 
mulAvxGFNI_2x2Xor_loop VZEROUPPER mulAvxGFNI_2x2Xor_end: RET // func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI ADDQ R8, SI // Add start offset to input ADDQ R8, BX ADDQ R8, DX MOVQ $0x0000000f, R8 MOVQ R8, X4 VPBROADCASTB X4, Y4 mulAvxTwo_2x2_64Xor_loop: // Load 2 outputs VMOVDQU (DI), Y0 VMOVDQU 32(DI), Y1 VMOVDQU (SI), Y2 VMOVDQU 32(SI), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) ADDQ $0x40, DI VMOVDQU Y2, (SI) VMOVDQU Y3, 32(SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x2_64Xor_loop VZEROUPPER mulAvxTwo_2x2_64Xor_end: RET // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, BX ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X6 VPBROADCASTB X6, Y6 mulAvxTwo_2x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 
160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) ADDQ $0x40, DI VMOVDQU Y2, (R8) VMOVDQU Y3, 32(R8) ADDQ $0x40, R8 VMOVDQU Y4, (SI) VMOVDQU Y5, 32(SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x3_64_loop VZEROUPPER mulAvxTwo_2x3_64_end: RET // func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, SI ADDQ R8, DI ADDQ R8, BX // Add start offset to input ADDQ R8, DX ADDQ R8, CX mulGFNI_2x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z9 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z9, Z6 VGF2P8AFFINEQB $0x00, Z1, Z9, Z7 VGF2P8AFFINEQB $0x00, Z2, Z9, Z8 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (CX), Z9 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 VXORPD Z6, Z10, Z6 VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 VXORPD Z7, Z10, Z7 VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 VXORPD Z8, Z10, Z8 // Store 3 outputs VMOVDQU64 Z6, (SI) ADDQ $0x40, SI VMOVDQU64 Z7, (DI) ADDQ $0x40, DI VMOVDQU64 Z8, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x3_64_loop VZEROUPPER mulGFNI_2x3_64_end: RET // func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x3(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, SI ADDQ R8, DI ADDQ R8, BX // Add start offset to input ADDQ R8, DX ADDQ R8, CX mulAvxGFNI_2x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, 
Y0, Y9, Y6 VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (CX), Y9 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 VXORPD Y6, Y10, Y6 VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 VXORPD Y7, Y10, Y7 VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 VXORPD Y8, Y10, Y8 // Store 3 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI VMOVDQU Y7, (DI) ADDQ $0x20, DI VMOVDQU Y8, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x3_loop VZEROUPPER mulAvxGFNI_2x3_end: RET // func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, SI ADDQ R8, DI ADDQ R8, BX // Add start offset to input ADDQ R8, DX ADDQ R8, CX mulGFNI_2x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (SI), Z6 VMOVDQU64 (DI), Z7 VMOVDQU64 (BX), Z8 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z9 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z9, Z10 VXORPD Z6, Z10, Z6 VGF2P8AFFINEQB $0x00, Z1, Z9, Z10 VXORPD Z7, Z10, Z7 VGF2P8AFFINEQB $0x00, Z2, Z9, Z10 VXORPD Z8, Z10, Z8 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (CX), Z9 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 VXORPD Z6, Z10, Z6 VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 VXORPD Z7, Z10, Z7 VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 VXORPD Z8, Z10, Z8 // Store 3 outputs VMOVDQU64 Z6, (SI) ADDQ $0x40, SI VMOVDQU64 Z7, (DI) ADDQ $0x40, DI VMOVDQU64 Z8, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x3_64Xor_loop VZEROUPPER mulGFNI_2x3_64Xor_end: RET // func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, SI ADDQ R8, DI ADDQ R8, BX // Add start offset to input ADDQ R8, DX ADDQ R8, CX mulAvxGFNI_2x3Xor_loop: // Load 3 outputs VMOVDQU (SI), Y6 VMOVDQU (DI), Y7 VMOVDQU (BX), Y8 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 VXORPD Y6, Y10, Y6 VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 VXORPD Y7, Y10, Y7 VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 VXORPD Y8, Y10, Y8 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (CX), Y9 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 VXORPD Y6, Y10, Y6 VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 VXORPD Y7, Y10, Y7 VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 VXORPD Y8, Y10, Y8 // Store 3 
outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI VMOVDQU Y7, (DI) ADDQ $0x20, DI VMOVDQU Y8, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x3Xor_loop VZEROUPPER mulAvxGFNI_2x3Xor_end: RET // func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, BX ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X6 VPBROADCASTB X6, Y6 mulAvxTwo_2x3_64Xor_loop: // Load 3 outputs VMOVDQU (DI), Y0 VMOVDQU 32(DI), Y1 VMOVDQU (R8), Y2 VMOVDQU 32(R8), Y3 VMOVDQU (SI), Y4 VMOVDQU 32(SI), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) ADDQ $0x40, DI VMOVDQU Y2, (R8) VMOVDQU Y3, 32(R8) ADDQ $0x40, R8 VMOVDQU Y4, (SI) VMOVDQU Y5, 32(SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x3_64Xor_loop VZEROUPPER mulAvxTwo_2x3_64Xor_end: RET // func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), SI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, SI // Add start offset to input ADDQ R10, BX ADDQ R10, DX 
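// Reference sketch (editorial note, not generated code): the 0x0f constant
// broadcast just below is the nibble mask every mulAvxTwo kernel depends on.
// Each input byte is split into low and high nibbles, and VPSHUFB indexes a
// 16-entry product table for each half. A rough scalar Go equivalent, where
// mulNibbles, lo and hi are hypothetical names for the pair of 16-entry
// tables that matrix holds for one coefficient:
//
//	func mulNibbles(lo, hi *[16]byte, in, out []byte) {
//		for i, b := range in {
//			// low-nibble product XOR high-nibble product = GF(2^8) multiply
//			out[i] = lo[b&0x0f] ^ hi[b>>4]
//		}
//	}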
MOVQ $0x0000000f, R10 MOVQ R10, X4 VPBROADCASTB X4, Y4 mulAvxTwo_2x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x4_loop VZEROUPPER mulAvxTwo_2x4_end: RET // func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), BX MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, SI ADDQ R9, DI ADDQ R9, R8 ADDQ R9, BX // Add start offset to input ADDQ R9, DX ADDQ R9, CX mulGFNI_2x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z12 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z12, Z8 VGF2P8AFFINEQB $0x00, Z1, Z12, Z9 VGF2P8AFFINEQB $0x00, Z2, Z12, Z10 VGF2P8AFFINEQB $0x00, Z3, Z12, Z11 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (CX), Z12 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 VXORPD Z8, Z13, Z8 VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 VXORPD Z11, Z13, Z11 // Store 4 outputs VMOVDQU64 Z8, (SI) ADDQ $0x40, SI VMOVDQU64 Z9, (DI) ADDQ $0x40, DI VMOVDQU64 Z10, (R8) ADDQ $0x40, R8 VMOVDQU64 Z11, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x4_64_loop VZEROUPPER mulGFNI_2x4_64_end: RET // func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x4(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 
32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), BX MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, SI ADDQ R9, DI ADDQ R9, R8 ADDQ R9, BX // Add start offset to input ADDQ R9, DX ADDQ R9, CX mulAvxGFNI_2x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 VXORPD Y8, Y13, Y8 VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 VXORPD Y11, Y13, Y11 // Store 4 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI VMOVDQU Y9, (DI) ADDQ $0x20, DI VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x4_loop VZEROUPPER mulAvxGFNI_2x4_end: RET // func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x4_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), BX MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, SI ADDQ R9, DI ADDQ R9, R8 ADDQ R9, BX // Add start offset to input ADDQ R9, DX ADDQ R9, CX mulGFNI_2x4_64Xor_loop: // Load 4 outputs VMOVDQU64 (SI), Z8 VMOVDQU64 (DI), Z9 VMOVDQU64 (R8), Z10 VMOVDQU64 (BX), Z11 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z12 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 VXORPD Z8, Z13, Z8 VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (CX), Z12 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 VXORPD Z8, Z13, Z8 VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 VXORPD Z11, Z13, Z11 // Store 4 outputs VMOVDQU64 Z8, (SI) ADDQ $0x40, SI VMOVDQU64 Z9, (DI) ADDQ $0x40, DI VMOVDQU64 Z10, (R8) ADDQ $0x40, R8 VMOVDQU64 Z11, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x4_64Xor_loop VZEROUPPER mulGFNI_2x4_64Xor_end: RET // func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 
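// Sketch (assumed semantics, not generated code): each VBROADCASTSD here
// replicates one 8-byte GF(2) affine matrix into all four qword lanes of a
// YMM register, so a single VGF2P8AFFINEQB multiplies 32 input bytes by that
// matrix at once. Per the documented behavior of GF2P8AFFINEQB with imm8 = 0,
// the per-byte transform is roughly (gf2p8affineByte is a hypothetical
// helper; needs math/bits):
//
//	func gf2p8affineByte(m uint64, b byte) byte {
//		var out byte
//		for i := 0; i < 8; i++ {
//			row := byte(m >> (8 * (7 - i))) // result bit i uses matrix byte 7-i
//			if bits.OnesCount8(row&b)&1 == 1 {
//				out |= 1 << i
//			}
//		}
//		return out
//	}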
VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), BX MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, SI ADDQ R9, DI ADDQ R9, R8 ADDQ R9, BX // Add start offset to input ADDQ R9, DX ADDQ R9, CX mulAvxGFNI_2x4Xor_loop: // Load 4 outputs VMOVDQU (SI), Y8 VMOVDQU (DI), Y9 VMOVDQU (R8), Y10 VMOVDQU (BX), Y11 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 VXORPD Y8, Y13, Y8 VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 VXORPD Y8, Y13, Y8 VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 VXORPD Y11, Y13, Y11 // Store 4 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI VMOVDQU Y9, (DI) ADDQ $0x20, DI VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x4Xor_loop VZEROUPPER mulAvxGFNI_2x4Xor_end: RET // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), SI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, SI // Add start offset to input ADDQ R10, BX ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X4 VPBROADCASTB X4, Y4 mulAvxTwo_2x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (DI), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (SI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x4Xor_loop 
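// The Xor kernels, like mulAvxTwo_2x4Xor finishing just below, differ from
// the plain kernels only in the output contract: existing output bytes are
// loaded first and the new products are XORed in, so callers can accumulate
// further input shards into partially built parity. Scalar sketch of the
// difference (same hypothetical nibble tables as in the sketch above):
//
//	out[i] = lo[b&0x0f] ^ hi[b>>4]  // plain kernel: overwrite
//	out[i] ^= lo[b&0x0f] ^ hi[b>>4] // Xor kernel: accumulate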
VZEROUPPER mulAvxTwo_2x4Xor_end: RET // func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, BX ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X5 VPBROADCASTB X5, Y5 mulAvxTwo_2x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x5_loop VZEROUPPER mulAvxTwo_2x5_end: RET // func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), BX MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, BX // Add start offset to input ADDQ R10, DX ADDQ R10, CX mulGFNI_2x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z15 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z15, Z10 VGF2P8AFFINEQB $0x00, Z1, Z15, Z11 VGF2P8AFFINEQB 
$0x00, Z2, Z15, Z12 VGF2P8AFFINEQB $0x00, Z3, Z15, Z13 VGF2P8AFFINEQB $0x00, Z4, Z15, Z14 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (CX), Z15 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 VXORPD Z10, Z16, Z10 VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 VXORPD Z11, Z16, Z11 VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 VXORPD Z14, Z16, Z14 // Store 5 outputs VMOVDQU64 Z10, (SI) ADDQ $0x40, SI VMOVDQU64 Z11, (DI) ADDQ $0x40, DI VMOVDQU64 Z12, (R8) ADDQ $0x40, R8 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 VMOVDQU64 Z14, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x5_64_loop VZEROUPPER mulGFNI_2x5_64_end: RET // func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x5(SB), $0-88 // Loading 9 of 10 tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, BX ADDQ R11, DX mulAvxGFNI_2x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (DI) ADDQ $0x20, DI VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (R9) ADDQ $0x20, R9 VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x5_loop VZEROUPPER mulAvxGFNI_2x5_end: RET // func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), BX MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ 
R10, BX // Add start offset to input ADDQ R10, DX ADDQ R10, CX mulGFNI_2x5_64Xor_loop: // Load 5 outputs VMOVDQU64 (SI), Z10 VMOVDQU64 (DI), Z11 VMOVDQU64 (R8), Z12 VMOVDQU64 (R9), Z13 VMOVDQU64 (BX), Z14 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z15 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 VXORPD Z10, Z16, Z10 VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 VXORPD Z11, Z16, Z11 VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 VXORPD Z14, Z16, Z14 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (CX), Z15 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 VXORPD Z10, Z16, Z10 VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 VXORPD Z11, Z16, Z11 VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 VXORPD Z14, Z16, Z14 // Store 5 outputs VMOVDQU64 Z10, (SI) ADDQ $0x40, SI VMOVDQU64 Z11, (DI) ADDQ $0x40, DI VMOVDQU64 Z12, (R8) ADDQ $0x40, R8 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 VMOVDQU64 Z14, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x5_64Xor_loop VZEROUPPER mulGFNI_2x5_64Xor_end: RET // func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 // Loading 9 of 10 tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, BX ADDQ R11, DX mulAvxGFNI_2x5Xor_loop: // Load 5 outputs VMOVDQU (DI), Y9 VMOVDQU (R8), Y10 VMOVDQU (R9), Y11 VMOVDQU (R10), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (DI) ADDQ $0x20, DI VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (R9) ADDQ $0x20, R9 VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x5Xor_loop VZEROUPPER mulAvxGFNI_2x5Xor_end: RET // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // 
Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, BX ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X5 VPBROADCASTB X5, Y5 mulAvxTwo_2x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (DI), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (SI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x5Xor_loop VZEROUPPER mulAvxTwo_2x5Xor_end: RET // func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, BX ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X6 VPBROADCASTB X6, Y6 mulAvxTwo_2x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 
224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x6_loop VZEROUPPER mulAvxTwo_2x6_end: RET // func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), BX MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, BX // Add start offset to input ADDQ R11, DX ADDQ R11, CX mulGFNI_2x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z18 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z18, Z12 VGF2P8AFFINEQB $0x00, Z1, Z18, Z13 VGF2P8AFFINEQB $0x00, Z2, Z18, Z14 VGF2P8AFFINEQB $0x00, Z3, Z18, Z15 VGF2P8AFFINEQB $0x00, Z4, Z18, Z16 VGF2P8AFFINEQB $0x00, Z5, Z18, Z17 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (CX), Z18 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 VXORPD Z12, Z19, Z12 VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 VXORPD Z13, Z19, Z13 VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 VXORPD Z14, Z19, Z14 VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 VXORPD Z15, Z19, Z15 VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 VXORPD Z17, Z19, Z17 // Store 6 outputs VMOVDQU64 Z12, (SI) ADDQ $0x40, SI VMOVDQU64 Z13, (DI) ADDQ $0x40, DI VMOVDQU64 Z14, (R8) ADDQ $0x40, R8 VMOVDQU64 Z15, (R9) ADDQ $0x40, R9 VMOVDQU64 Z16, (R10) ADDQ $0x40, R10 VMOVDQU64 Z17, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x6_64_loop VZEROUPPER mulGFNI_2x6_64_end: RET // func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, 
out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x6(SB), $0-88 // Loading 8 of 12 tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, BX ADDQ R12, DX mulAvxGFNI_2x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI VMOVDQU Y9, (R8) ADDQ $0x20, R8 VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R10) ADDQ $0x20, R10 VMOVDQU Y12, (R11) ADDQ $0x20, R11 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x6_loop VZEROUPPER mulAvxGFNI_2x6_end: RET // func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), BX MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, BX // Add start offset to input ADDQ R11, DX ADDQ R11, CX mulGFNI_2x6_64Xor_loop: // Load 6 outputs VMOVDQU64 (SI), Z12 VMOVDQU64 (DI), Z13 VMOVDQU64 (R8), Z14 VMOVDQU64 (R9), Z15 VMOVDQU64 (R10), Z16 VMOVDQU64 (BX), Z17 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z18 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 VXORPD Z12, Z19, Z12 VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 VXORPD Z13, Z19, Z13 VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 VXORPD Z14, Z19, Z14 VGF2P8AFFINEQB $0x00, 
Z3, Z18, Z19 VXORPD Z15, Z19, Z15 VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (CX), Z18 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 VXORPD Z12, Z19, Z12 VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 VXORPD Z13, Z19, Z13 VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 VXORPD Z14, Z19, Z14 VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 VXORPD Z15, Z19, Z15 VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 VXORPD Z17, Z19, Z17 // Store 6 outputs VMOVDQU64 Z12, (SI) ADDQ $0x40, SI VMOVDQU64 Z13, (DI) ADDQ $0x40, DI VMOVDQU64 Z14, (R8) ADDQ $0x40, R8 VMOVDQU64 Z15, (R9) ADDQ $0x40, R9 VMOVDQU64 Z16, (R10) ADDQ $0x40, R10 VMOVDQU64 Z17, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x6_64Xor_loop VZEROUPPER mulGFNI_2x6_64Xor_end: RET // func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 // Loading 8 of 12 tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, BX ADDQ R12, DX mulAvxGFNI_2x6Xor_loop: // Load 6 outputs VMOVDQU (DI), Y8 VMOVDQU (R8), Y9 VMOVDQU (R9), Y10 VMOVDQU (R10), Y11 VMOVDQU (R11), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI VMOVDQU Y9, (R8) ADDQ $0x20, R8 VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R10) ADDQ $0x20, R10 VMOVDQU Y12, (R11) ADDQ $0x20, R11 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x6Xor_loop VZEROUPPER mulAvxGFNI_2x6Xor_end: RET // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX 
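// Shared prologue pattern: each kernel converts the byte count n into a
// whole-block loop count and exits immediately when it is zero, as in the
// SHRQ just below. The 32-bytes-per-loop AVX2 and AVX-GFNI kernels shift by
// 5, the 64-byte _64 kernels by 6. Go sketch of the equivalent check
// (blocks is a hypothetical name; any tail shorter than one block appears
// to be left to the caller):
//
//	blocks := n >> 5 // n >> 6 for the 64-byte variants
//	if blocks == 0 {
//		return
//	}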
MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, BX ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X6 VPBROADCASTB X6, Y6 mulAvxTwo_2x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (DI), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (SI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x6Xor_loop VZEROUPPER mulAvxTwo_2x6Xor_end: RET // func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, BX ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 mulAvxTwo_2x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 
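// In each multi-input loop the first input seeds every output register with
// VPXOR, and later inputs accumulate through the XOR3WAY macro (one
// VPTERNLOGD under GOAMD64_v4, two VPXORs otherwise). The computation per
// output byte is out[j] = XOR over i of gfMul(coef[j][i], in[i]), with XOR
// acting as GF(2^8) addition. Sketch with hypothetical gfMul, coef and in:
//
//	acc := gfMul(coef[j][0], in[0][k])
//	for i := 1; i < numInputs; i++ {
//		acc ^= gfMul(coef[j][i], in[i][k])
//	}
//	out[j][k] = acc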
VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x7_loop VZEROUPPER mulAvxTwo_2x7_end: RET // func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), BX MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, BX // Add start offset to input ADDQ R12, DX ADDQ R12, CX mulGFNI_2x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (DX), Z21 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z21, Z14 VGF2P8AFFINEQB $0x00, Z1, Z21, Z15 VGF2P8AFFINEQB $0x00, Z2, Z21, Z16 VGF2P8AFFINEQB $0x00, Z3, Z21, Z17 VGF2P8AFFINEQB $0x00, Z4, Z21, Z18 VGF2P8AFFINEQB $0x00, Z5, Z21, Z19 VGF2P8AFFINEQB $0x00, Z6, Z21, Z20 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (CX), Z21 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 VXORPD Z14, Z22, Z14 VGF2P8AFFINEQB $0x00, Z8, 
Z21, Z22 VXORPD Z15, Z22, Z15 VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 VXORPD Z16, Z22, Z16 VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 VXORPD Z17, Z22, Z17 VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 VXORPD Z20, Z22, Z20 // Store 7 outputs VMOVDQU64 Z14, (SI) ADDQ $0x40, SI VMOVDQU64 Z15, (DI) ADDQ $0x40, DI VMOVDQU64 Z16, (R8) ADDQ $0x40, R8 VMOVDQU64 Z17, (R9) ADDQ $0x40, R9 VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (R11) ADDQ $0x40, R11 VMOVDQU64 Z20, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x7_64_loop VZEROUPPER mulGFNI_2x7_64_end: RET // func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x7(SB), $0-88 // Loading 7 of 14 tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, BX ADDQ R13, DX mulAvxGFNI_2x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (DI) ADDQ $0x20, DI VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x7_loop VZEROUPPER mulAvxGFNI_2x7_end: RET // func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 
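// VBROADCASTF32X2 replicates one 8-byte affine matrix across all eight qword
// lanes of a ZMM register, letting each VGF2P8AFFINEQB cover 64 bytes per
// instruction. A 2x7 kernel needs inputs x outputs = 14 matrices of 8 bytes
// each, which is why these loads walk offsets 0 through 104(CX) into
// Z0..Z13 ("Loading all tables to registers").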
VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), BX MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, BX // Add start offset to input ADDQ R12, DX ADDQ R12, CX mulGFNI_2x7_64Xor_loop: // Load 7 outputs VMOVDQU64 (SI), Z14 VMOVDQU64 (DI), Z15 VMOVDQU64 (R8), Z16 VMOVDQU64 (R9), Z17 VMOVDQU64 (R10), Z18 VMOVDQU64 (R11), Z19 VMOVDQU64 (BX), Z20 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (DX), Z21 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 VXORPD Z14, Z22, Z14 VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 VXORPD Z15, Z22, Z15 VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 VXORPD Z16, Z22, Z16 VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 VXORPD Z17, Z22, Z17 VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (CX), Z21 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 VXORPD Z14, Z22, Z14 VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 VXORPD Z15, Z22, Z15 VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 VXORPD Z16, Z22, Z16 VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 VXORPD Z17, Z22, Z17 VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 VXORPD Z20, Z22, Z20 // Store 7 outputs VMOVDQU64 Z14, (SI) ADDQ $0x40, SI VMOVDQU64 Z15, (DI) ADDQ $0x40, DI VMOVDQU64 Z16, (R8) ADDQ $0x40, R8 VMOVDQU64 Z17, (R9) ADDQ $0x40, R9 VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (R11) ADDQ $0x40, R11 VMOVDQU64 Z20, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x7_64Xor_loop VZEROUPPER mulGFNI_2x7_64Xor_end: RET // func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 // Loading 7 of 14 tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, BX ADDQ R13, DX mulAvxGFNI_2x7Xor_loop: // Load 7 outputs VMOVDQU (DI), Y7 VMOVDQU (R8), Y8 VMOVDQU (R9), Y9 VMOVDQU (R10), Y10 VMOVDQU (R11), Y11 VMOVDQU (R12), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, 
Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (DI) ADDQ $0x20, DI VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x7Xor_loop VZEROUPPER mulAvxGFNI_2x7Xor_end: RET // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, BX ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 mulAvxTwo_2x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (DI), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (SI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, 
Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x7Xor_loop VZEROUPPER mulAvxTwo_2x7Xor_end: RET // func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 45 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), SI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, SI // Add start offset to input ADDQ R14, BX ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 mulAvxTwo_2x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, 
Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x8_loop VZEROUPPER mulAvxTwo_2x8_end: RET // func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX ADDQ R13, CX mulGFNI_2x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (DX), Z24 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z24, Z16 VGF2P8AFFINEQB $0x00, Z1, Z24, Z17 VGF2P8AFFINEQB $0x00, Z2, Z24, Z18 VGF2P8AFFINEQB $0x00, Z3, Z24, Z19 VGF2P8AFFINEQB $0x00, Z4, Z24, Z20 VGF2P8AFFINEQB $0x00, Z5, Z24, Z21 VGF2P8AFFINEQB $0x00, Z6, Z24, Z22 VGF2P8AFFINEQB $0x00, Z7, Z24, Z23 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (CX), Z24 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 VXORPD Z16, Z25, Z16 VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 VXORPD Z17, Z25, Z17 VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 VXORPD Z23, Z25, Z23 // Store 8 outputs VMOVDQU64 Z16, (SI) ADDQ $0x40, SI VMOVDQU64 Z17, (DI) ADDQ $0x40, DI VMOVDQU64 Z18, (R8) ADDQ $0x40, R8 VMOVDQU64 Z19, (R9) ADDQ $0x40, R9 VMOVDQU64 Z20, (R10) ADDQ $0x40, R10 VMOVDQU64 Z21, (R11) ADDQ $0x40, R11 VMOVDQU64 Z22, (R12) ADDQ $0x40, R12 VMOVDQU64 Z23, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x8_64_loop VZEROUPPER mulGFNI_2x8_64_end: RET // func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x8(SB), $0-88 // Loading 6 of 16 tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ 
out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), SI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, SI // Add start offset to input ADDQ R14, BX ADDQ R14, DX mulAvxGFNI_2x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (DI) ADDQ $0x20, DI VMOVDQU Y7, (R8) ADDQ $0x20, R8 VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x8_loop VZEROUPPER mulAvxGFNI_2x8_end: RET // func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX ADDQ R13, CX mulGFNI_2x8_64Xor_loop: // Load 8 outputs VMOVDQU64 (SI), Z16 VMOVDQU64 (DI), Z17 VMOVDQU64 (R8), Z18 VMOVDQU64 (R9), Z19 VMOVDQU64 (R10), Z20 VMOVDQU64 (R11), Z21 VMOVDQU64 (R12), Z22 VMOVDQU64 (BX), Z23 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (DX), Z24 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 VXORPD Z16, 
Z25, Z16 VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 VXORPD Z17, Z25, Z17 VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 VXORPD Z23, Z25, Z23 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (CX), Z24 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 VXORPD Z16, Z25, Z16 VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 VXORPD Z17, Z25, Z17 VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 VXORPD Z23, Z25, Z23 // Store 8 outputs VMOVDQU64 Z16, (SI) ADDQ $0x40, SI VMOVDQU64 Z17, (DI) ADDQ $0x40, DI VMOVDQU64 Z18, (R8) ADDQ $0x40, R8 VMOVDQU64 Z19, (R9) ADDQ $0x40, R9 VMOVDQU64 Z20, (R10) ADDQ $0x40, R10 VMOVDQU64 Z21, (R11) ADDQ $0x40, R11 VMOVDQU64 Z22, (R12) ADDQ $0x40, R12 VMOVDQU64 Z23, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x8_64Xor_loop VZEROUPPER mulGFNI_2x8_64Xor_end: RET // func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 // Loading 6 of 16 tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), SI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, SI // Add start offset to input ADDQ R14, BX ADDQ R14, DX mulAvxGFNI_2x8Xor_loop: // Load 8 outputs VMOVDQU (DI), Y6 VMOVDQU (R8), Y7 VMOVDQU (R9), Y8 VMOVDQU (R10), Y9 VMOVDQU (R11), Y10 VMOVDQU (R12), Y11 VMOVDQU (R13), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, 
Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (DI) ADDQ $0x20, DI VMOVDQU Y7, (R8) ADDQ $0x20, R8 VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x8Xor_loop VZEROUPPER mulAvxGFNI_2x8Xor_end: RET // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 45 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), SI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, SI // Add start offset to input ADDQ R14, BX ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 mulAvxTwo_2x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (DI), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (SI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 
928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x8Xor_loop VZEROUPPER mulAvxTwo_2x8Xor_end: RET // func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), SI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, SI // Add start offset to input ADDQ R15, BX ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 mulAvxTwo_2x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 
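	// The mulAvxTwo kernels multiply in GF(2^8) via a 4-bit split: each
	// source byte is divided into low and high nibbles (VPAND with the
	// 0x0f mask, VPSRLQ $0x04), each nibble selects from a 16-entry
	// table with VPSHUFB, and XOR3WAY folds both partial products into
	// the accumulator (one VPTERNLOGD under GOAMD64_v4, else two VPXORs).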
VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x9_loop VZEROUPPER mulAvxTwo_2x9_end: RET // func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX ADDQ R14, CX mulGFNI_2x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (DX), Z27 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z27, Z18 VGF2P8AFFINEQB $0x00, Z1, Z27, Z19 VGF2P8AFFINEQB $0x00, Z2, Z27, Z20 VGF2P8AFFINEQB $0x00, Z3, Z27, Z21 VGF2P8AFFINEQB $0x00, Z4, Z27, Z22 VGF2P8AFFINEQB $0x00, Z5, Z27, Z23 VGF2P8AFFINEQB $0x00, Z6, Z27, Z24 VGF2P8AFFINEQB $0x00, Z7, Z27, Z25 VGF2P8AFFINEQB $0x00, Z8, Z27, Z26 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (CX), Z27 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 VXORPD Z18, Z28, Z18 VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 VXORPD Z19, Z28, Z19 VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 VXORPD Z20, Z28, Z20 VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 VXORPD Z21, Z28, Z21 VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 VXORPD Z22, Z28, Z22 VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 VXORPD Z23, Z28, Z23 VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 VXORPD Z26, Z28, Z26 // Store 9 outputs VMOVDQU64 Z18, (SI) ADDQ $0x40, SI VMOVDQU64 Z19, (DI) ADDQ $0x40, DI VMOVDQU64 Z20, (R8) ADDQ $0x40, R8 VMOVDQU64 Z21, (R9) ADDQ $0x40, R9 VMOVDQU64 Z22, (R10) ADDQ $0x40, R10 VMOVDQU64 Z23, (R11) ADDQ $0x40, R11 VMOVDQU64 Z24, (R12) ADDQ $0x40, R12 VMOVDQU64 Z25, (R13) ADDQ $0x40, R13 VMOVDQU64 Z26, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x9_64_loop VZEROUPPER mulGFNI_2x9_64_end: RET // func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT 
·mulAvxGFNI_2x9(SB), $0-88 // Loading 5 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), SI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, SI // Add start offset to input ADDQ R15, BX ADDQ R15, DX mulAvxGFNI_2x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs VMOVDQU Y5, (DI) ADDQ $0x20, DI VMOVDQU Y6, (R8) ADDQ $0x20, R8 VMOVDQU Y7, (R9) ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x9_loop VZEROUPPER mulAvxGFNI_2x9_end: RET // func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ 
out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX ADDQ R14, CX mulGFNI_2x9_64Xor_loop: // Load 9 outputs VMOVDQU64 (SI), Z18 VMOVDQU64 (DI), Z19 VMOVDQU64 (R8), Z20 VMOVDQU64 (R9), Z21 VMOVDQU64 (R10), Z22 VMOVDQU64 (R11), Z23 VMOVDQU64 (R12), Z24 VMOVDQU64 (R13), Z25 VMOVDQU64 (BX), Z26 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (DX), Z27 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 VXORPD Z18, Z28, Z18 VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 VXORPD Z19, Z28, Z19 VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 VXORPD Z20, Z28, Z20 VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 VXORPD Z21, Z28, Z21 VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 VXORPD Z22, Z28, Z22 VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 VXORPD Z23, Z28, Z23 VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (CX), Z27 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 VXORPD Z18, Z28, Z18 VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 VXORPD Z19, Z28, Z19 VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 VXORPD Z20, Z28, Z20 VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 VXORPD Z21, Z28, Z21 VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 VXORPD Z22, Z28, Z22 VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 VXORPD Z23, Z28, Z23 VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 VXORPD Z26, Z28, Z26 // Store 9 outputs VMOVDQU64 Z18, (SI) ADDQ $0x40, SI VMOVDQU64 Z19, (DI) ADDQ $0x40, DI VMOVDQU64 Z20, (R8) ADDQ $0x40, R8 VMOVDQU64 Z21, (R9) ADDQ $0x40, R9 VMOVDQU64 Z22, (R10) ADDQ $0x40, R10 VMOVDQU64 Z23, (R11) ADDQ $0x40, R11 VMOVDQU64 Z24, (R12) ADDQ $0x40, R12 VMOVDQU64 Z25, (R13) ADDQ $0x40, R13 VMOVDQU64 Z26, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x9_64Xor_loop VZEROUPPER mulGFNI_2x9_64Xor_end: RET // func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 // Loading 5 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), SI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, SI // Add start offset to input ADDQ R15, BX ADDQ R15, DX mulAvxGFNI_2x9Xor_loop: // Load 9 outputs VMOVDQU (DI), Y5 VMOVDQU (R8), Y6 VMOVDQU (R9), Y7 VMOVDQU (R10), Y8 VMOVDQU (R11), Y9 VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 
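	// As noted above, only 5 of this kernel's 18 affine tables stay
	// resident (Y0-Y4); the remainder do not fit in the 16 YMM registers,
	// so they are re-broadcast from the matrix with VBROADCASTSD on every
	// iteration, recycling Y15 as scratch for each partial product.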
	VXORPD         Y5, Y15, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   40(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU        (DX), Y14
	ADDQ           $0x20, DX
	VBROADCASTSD   72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y5, Y15, Y5
	VBROADCASTSD   80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y6, Y15, Y6
	VBROADCASTSD   88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y7, Y15, Y7
	VBROADCASTSD   96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y8, Y15, Y8
	VBROADCASTSD   104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y9, Y15, Y9
	VBROADCASTSD   112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y10, Y15, Y10
	VBROADCASTSD   120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y11, Y15, Y11
	VBROADCASTSD   128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y12, Y15, Y12
	VBROADCASTSD   136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD         Y13, Y15, Y13

	// Store 9 outputs
	VMOVDQU Y5, (DI)
	ADDQ    $0x20, DI
	VMOVDQU Y6, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y7, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y8, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y9, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y10, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y11, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y12, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y13, (SI)
	ADDQ    $0x20, SI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxGFNI_2x9Xor_loop
	VZEROUPPER

mulAvxGFNI_2x9Xor_end:
	RET

// func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ  n+80(FP), AX
	MOVQ  matrix_base+0(FP), CX
	SHRQ  $0x05, AX
	TESTQ AX, AX
	JZ    mulAvxTwo_2x9Xor_end
	MOVQ  in_base+24(FP), DX
	MOVQ  (DX), BX
	MOVQ  24(DX), DX
	MOVQ  out_base+48(FP), SI
	MOVQ  (SI), DI
	MOVQ  24(SI), R8
	MOVQ  48(SI), R9
	MOVQ  72(SI), R10
	MOVQ  96(SI), R11
	MOVQ  120(SI), R12
	MOVQ  144(SI), R13
	MOVQ  168(SI), R14
	MOVQ  192(SI), SI
	MOVQ  start+72(FP), R15

	// Add start offset to output
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, SI

	// Add start offset to input
	ADDQ         R15, BX
	ADDQ         R15, DX
	MOVQ         $0x0000000f, R15
	MOVQ         R15, X9
	VPBROADCASTB X9, Y9

mulAvxTwo_2x9Xor_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y12
	ADDQ    $0x20, BX
	VPSRLQ  $0x04, Y12, Y13
	VPAND   Y9, Y12, Y12
	VPAND   Y9, Y13, Y13
	VMOVDQU (DI), Y0
	VMOVDQU (CX), Y10
	VMOVDQU 32(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU (R8), Y1
	VMOVDQU 64(CX), Y10
	VMOVDQU 96(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU (R9), Y2
	VMOVDQU 128(CX), Y10
	VMOVDQU 160(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU (R10), Y3
	VMOVDQU 192(CX), Y10
	VMOVDQU 224(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU (R11), Y4
	VMOVDQU 256(CX), Y10
	VMOVDQU 288(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU (R12), Y5
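	// The ...Xor kernels accumulate into existing output shards: each
	// destination vector is loaded before partial products are XORed in,
	// while the plain kernels overwrite the outputs with the first
	// product. A minimal Go sketch of the intended byte-level semantics,
	// assuming a hypothetical coefficient matrix coef and helper gfMul
	// (this file receives pre-expanded lookup tables instead):
	//
	//	for i := range out[0] {             // each byte offset
	//		for r := range out {        // each output shard
	//			var acc byte
	//			for c := range in { // each input shard
	//				acc ^= gfMul(coef[r][c], in[c][i])
	//			}
	//			out[r][i] = acc     // Xor variants: out[r][i] ^= acc
	//		}
	//	}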
VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (SI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x9Xor_loop VZEROUPPER mulAvxTwo_2x9Xor_end: RET // func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 55 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, BX ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 mulAvxTwo_2x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 
224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (R15) ADDQ $0x20, R15 VMOVDQU Y9, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x10_loop VZEROUPPER mulAvxTwo_2x10_end: RET // func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), 
R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX ADDQ R15, CX mulGFNI_2x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs VMOVDQU64 Z20, (SI) ADDQ $0x40, SI VMOVDQU64 Z21, (DI) ADDQ $0x40, DI VMOVDQU64 Z22, (R8) ADDQ $0x40, R8 VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x10_64_loop VZEROUPPER mulGFNI_2x10_64_end: RET // func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x10(SB), $8-88 // Loading 4 of 20 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, BX ADDQ BP, DX mulAvxGFNI_2x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 
outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs VMOVDQU Y4, (DI) ADDQ $0x20, DI VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x10_loop VZEROUPPER mulAvxGFNI_2x10_end: RET // func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_2x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX ADDQ R15, CX mulGFNI_2x10_64Xor_loop: // Load 10 outputs VMOVDQU64 (SI), Z20 VMOVDQU64 (DI), Z21 VMOVDQU64 (R8), Z22 VMOVDQU64 (R9), Z23 VMOVDQU64 (R10), Z24 VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (BX), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB 
$0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs VMOVDQU64 Z20, (SI) ADDQ $0x40, SI VMOVDQU64 Z21, (DI) ADDQ $0x40, DI VMOVDQU64 Z22, (R8) ADDQ $0x40, R8 VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (BX) ADDQ $0x40, BX // Prepare for next loop DECQ AX JNZ mulGFNI_2x10_64Xor_loop VZEROUPPER mulGFNI_2x10_64Xor_end: RET // func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 // Loading 4 of 20 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_2x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, BX ADDQ BP, DX mulAvxGFNI_2x10Xor_loop: // Load 10 outputs VMOVDQU (DI), Y4 VMOVDQU (R8), Y5 VMOVDQU (R9), Y6 VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 
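	// VGF2P8AFFINEQB $0x00, Ytbl, Ysrc, Ydst applies the 8x8 GF(2)
	// bit-matrix held in Ytbl to every byte of Ysrc; each 64-bit matrix
	// entry encodes multiplication by one coefficient in GF(2^8), and the
	// $0x00 immediate is the affine constant term (zero, so a pure
	// multiply). One instruction thus replaces the two VPSHUFB lookups
	// needed on the AVX2 path.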
VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs VMOVDQU Y4, (DI) ADDQ $0x20, DI VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_2x10Xor_loop VZEROUPPER mulAvxGFNI_2x10Xor_end: RET // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 55 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, BX ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 mulAvxTwo_2x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (DI), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ 
$0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (R15) ADDQ $0x20, R15 VMOVDQU Y9, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x10Xor_loop VZEROUPPER mulAvxTwo_2x10Xor_end: RET // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, BX ADDQ R8, SI ADDQ R8, DX MOVQ $0x0000000f, R8 MOVQ R8, X2 VPBROADCASTB X2, Y2 mulAvxTwo_3x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, 
(DI) VMOVDQU Y1, 32(DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x1_64_loop VZEROUPPER mulAvxTwo_3x1_64_end: RET // func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 6 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), SI MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, CX mulGFNI_3x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z4 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z4, Z3 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z4 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 VXORPD Z3, Z4, Z3 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (CX), Z4 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 VXORPD Z3, Z4, Z3 // Store 1 outputs VMOVDQU64 Z3, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x1_64_loop VZEROUPPER mulGFNI_3x1_64_end: RET // func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 6 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), SI MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, CX mulAvxGFNI_3x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y4 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 VXORPD Y3, Y4, Y3 // Store 1 outputs VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x1_loop VZEROUPPER mulAvxGFNI_3x1_end: RET // func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 6 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), SI MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, CX mulGFNI_3x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (SI), Z3 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z4 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z4, Z4 VXORPD Z3, 
Z4, Z3 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z4 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 VXORPD Z3, Z4, Z3 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (CX), Z4 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 VXORPD Z3, Z4, Z3 // Store 1 outputs VMOVDQU64 Z3, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x1_64Xor_loop VZEROUPPER mulGFNI_3x1_64Xor_end: RET // func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 6 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), SI MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, CX mulAvxGFNI_3x1Xor_loop: // Load 1 outputs VMOVDQU (SI), Y3 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y4 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 VXORPD Y3, Y4, Y3 // Store 1 outputs VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x1Xor_loop VZEROUPPER mulAvxGFNI_3x1Xor_end: RET // func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, BX ADDQ R8, SI ADDQ R8, DX MOVQ $0x0000000f, R8 MOVQ R8, X2 VPBROADCASTB X2, Y2 mulAvxTwo_3x1_64Xor_loop: // Load 1 outputs VMOVDQU (DI), Y0 VMOVDQU 32(DI), Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 
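	// Y3/Y4 hold the low- and high-nibble lookup tables for input 2; the
	// VPSHUFB pairs below index them with the masked nibbles and XOR3WAY
	// folds both partial products into the accumulators Y0/Y1.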
VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x1_64Xor_loop VZEROUPPER mulAvxTwo_3x1_64Xor_end: RET // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X4 VPBROADCASTB X4, Y4 mulAvxTwo_3x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) ADDQ $0x40, R8 VMOVDQU Y2, (DI) VMOVDQU Y3, 32(DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x2_64_loop VZEROUPPER mulAvxTwo_3x2_64_end: RET // func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ 
out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI ADDQ R8, SI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, CX mulGFNI_3x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z8 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z8, Z6 VGF2P8AFFINEQB $0x00, Z1, Z8, Z7 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z8 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 VXORPD Z6, Z9, Z6 VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 VXORPD Z7, Z9, Z7 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (CX), Z8 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 VXORPD Z6, Z9, Z6 VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 VXORPD Z7, Z9, Z7 // Store 2 outputs VMOVDQU64 Z6, (DI) ADDQ $0x40, DI VMOVDQU64 Z7, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x2_64_loop VZEROUPPER mulGFNI_3x2_64_end: RET // func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x2(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI ADDQ R8, SI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, CX mulAvxGFNI_3x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 VXORPD Y6, Y9, Y6 VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 VXORPD Y6, Y9, Y6 VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 VXORPD Y7, Y9, Y7 // Store 2 outputs VMOVDQU Y6, (DI) ADDQ $0x20, DI VMOVDQU Y7, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x2_loop VZEROUPPER mulAvxGFNI_3x2_end: RET // func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI ADDQ R8, SI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, CX mulGFNI_3x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (DI), Z6 VMOVDQU64 (SI), Z7 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z8 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 VXORPD Z6, Z9, Z6 VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 VXORPD Z7, Z9, Z7 
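	// Each VGF2P8AFFINEQB multiplies all 64 input bytes by one GF(2^8)
	// matrix coefficient (broadcast into Z0..Z5 above) in a single
	// instruction; VXORPD then accumulates the product into the output.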
// Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z8 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 VXORPD Z6, Z9, Z6 VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 VXORPD Z7, Z9, Z7 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (CX), Z8 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 VXORPD Z6, Z9, Z6 VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 VXORPD Z7, Z9, Z7 // Store 2 outputs VMOVDQU64 Z6, (DI) ADDQ $0x40, DI VMOVDQU64 Z7, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x2_64Xor_loop VZEROUPPER mulGFNI_3x2_64Xor_end: RET // func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI ADDQ R8, SI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, CX mulAvxGFNI_3x2Xor_loop: // Load 2 outputs VMOVDQU (DI), Y6 VMOVDQU (SI), Y7 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 VXORPD Y6, Y9, Y6 VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 VXORPD Y6, Y9, Y6 VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 VXORPD Y6, Y9, Y6 VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 VXORPD Y7, Y9, Y7 // Store 2 outputs VMOVDQU Y6, (DI) ADDQ $0x20, DI VMOVDQU Y7, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x2Xor_loop VZEROUPPER mulAvxGFNI_3x2Xor_end: RET // func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X4 VPBROADCASTB X4, Y4 mulAvxTwo_3x2_64Xor_loop: // Load 2 outputs VMOVDQU (R8), Y0 VMOVDQU 32(R8), Y1 VMOVDQU (DI), Y2 VMOVDQU 32(DI), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, 
Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) ADDQ $0x40, R8 VMOVDQU Y2, (DI) VMOVDQU Y3, 32(DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x2_64Xor_loop VZEROUPPER mulAvxTwo_3x2_64Xor_end: RET // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, DI // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X6 VPBROADCASTB X6, Y6 mulAvxTwo_3x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB 
Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) ADDQ $0x40, R8 VMOVDQU Y2, (R9) VMOVDQU Y3, 32(R9) ADDQ $0x40, R9 VMOVDQU Y4, (DI) VMOVDQU Y5, 32(DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x3_64_loop VZEROUPPER mulAvxTwo_3x3_64_end: RET // func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, CX mulGFNI_3x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z12 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z12, Z9 VGF2P8AFFINEQB $0x00, Z1, Z12, Z10 VGF2P8AFFINEQB $0x00, Z2, Z12, Z11 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z12 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (CX), Z12 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 VXORPD Z11, Z13, Z11 // Store 3 outputs VMOVDQU64 Z9, (DI) ADDQ $0x40, DI VMOVDQU64 Z10, (R8) ADDQ $0x40, R8 VMOVDQU64 Z11, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x3_64_loop VZEROUPPER mulGFNI_3x3_64_end: RET // func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x3(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ 
in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, CX mulAvxGFNI_3x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 VXORPD Y11, Y13, Y11 // Store 3 outputs VMOVDQU Y9, (DI) ADDQ $0x20, DI VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x3_loop VZEROUPPER mulAvxGFNI_3x3_end: RET // func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, CX mulGFNI_3x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (DI), Z9 VMOVDQU64 (R8), Z10 VMOVDQU64 (SI), Z11 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z12 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z12 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (CX), Z12 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 VXORPD Z9, Z13, Z9 VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 VXORPD Z11, Z13, Z11 // Store 3 outputs VMOVDQU64 Z9, (DI) ADDQ $0x40, DI VMOVDQU64 Z10, (R8) ADDQ $0x40, R8 VMOVDQU64 Z11, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x3_64Xor_loop VZEROUPPER mulGFNI_3x3_64Xor_end: RET // func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ 
matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, CX mulAvxGFNI_3x3Xor_loop: // Load 3 outputs VMOVDQU (DI), Y9 VMOVDQU (R8), Y10 VMOVDQU (SI), Y11 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 VXORPD Y9, Y13, Y9 VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 VXORPD Y11, Y13, Y11 // Store 3 outputs VMOVDQU Y9, (DI) ADDQ $0x20, DI VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x3Xor_loop VZEROUPPER mulAvxGFNI_3x3Xor_end: RET // func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, DI // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X6 VPBROADCASTB X6, Y6 mulAvxTwo_3x3_64Xor_loop: // Load 3 outputs VMOVDQU (R8), Y0 VMOVDQU 32(R8), Y1 VMOVDQU (R9), Y2 VMOVDQU 32(R9), Y3 VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, 
Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) ADDQ $0x40, R8 VMOVDQU Y2, (R9) VMOVDQU Y3, 32(R9) ADDQ $0x40, R9 VMOVDQU Y4, (DI) VMOVDQU Y5, 32(DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x3_64Xor_loop VZEROUPPER mulAvxTwo_3x3_64Xor_end: RET // func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X4 VPBROADCASTB X4, Y4 mulAvxTwo_3x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX 
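	// Split each input byte into low and high nibbles so the full
	// 256-entry GF(2^8) multiply table collapses into two 16-byte
	// VPSHUFB lookups per output.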
VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x4_loop VZEROUPPER mulAvxTwo_3x4_end: RET // func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), SI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, SI // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, CX mulGFNI_3x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z16 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z16, Z12 VGF2P8AFFINEQB $0x00, Z1, Z16, Z13 VGF2P8AFFINEQB $0x00, Z2, Z16, Z14 VGF2P8AFFINEQB $0x00, Z3, Z16, Z15 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (BX), Z16 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 VXORPD Z12, Z17, Z12 VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 VXORPD Z13, Z17, Z13 VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 VXORPD Z14, Z17, Z14 VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 VXORPD Z15, Z17, Z15 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (CX), Z16 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 VXORPD Z12, Z17, Z12 VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 VXORPD Z13, Z17, Z13 VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 VXORPD Z14, Z17, Z14 VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 VXORPD Z15, Z17, Z15 // Store 4 outputs VMOVDQU64 Z12, (DI) ADDQ $0x40, DI VMOVDQU64 Z13, (R8) ADDQ $0x40, R8 VMOVDQU64 Z14, (R9) ADDQ $0x40, R9 VMOVDQU64 Z15, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x4_64_loop VZEROUPPER mulGFNI_3x4_64_end: RET // func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x4(SB), $0-88 // Loading 10 of 12 tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX 
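	// Only 10 of the 12 matrix constants fit in YMM registers here; the
	// remaining two are broadcast from 80(CX) and 88(CX) inside the loop.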
MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DX mulAvxGFNI_3x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (R9) ADDQ $0x20, R9 VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x4_loop VZEROUPPER mulAvxGFNI_3x4_end: RET // func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x4_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), SI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, SI // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, CX mulGFNI_3x4_64Xor_loop: // Load 4 outputs VMOVDQU64 (DI), Z12 VMOVDQU64 (R8), Z13 VMOVDQU64 (R9), Z14 VMOVDQU64 (SI), Z15 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z16 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 VXORPD Z12, Z17, Z12 VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 VXORPD Z13, Z17, Z13 VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 VXORPD Z14, Z17, Z14 VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 VXORPD Z15, Z17, Z15 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (BX), Z16 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 VXORPD Z12, Z17, Z12 VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 VXORPD Z13, Z17, Z13 VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 VXORPD Z14, Z17, Z14 VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 VXORPD Z15, Z17, Z15 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (CX), Z16 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 VXORPD Z12, Z17, Z12 VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 VXORPD Z13, Z17, Z13 VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 VXORPD Z14, Z17, Z14 VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 
VXORPD Z15, Z17, Z15 // Store 4 outputs VMOVDQU64 Z12, (DI) ADDQ $0x40, DI VMOVDQU64 Z13, (R8) ADDQ $0x40, R8 VMOVDQU64 Z14, (R9) ADDQ $0x40, R9 VMOVDQU64 Z15, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x4_64Xor_loop VZEROUPPER mulGFNI_3x4_64Xor_end: RET // func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 // Loading 10 of 12 tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DX mulAvxGFNI_3x4Xor_loop: // Load 4 outputs VMOVDQU (R8), Y10 VMOVDQU (R9), Y11 VMOVDQU (R10), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R8) ADDQ $0x20, R8 VMOVDQU Y11, (R9) ADDQ $0x20, R9 VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x4Xor_loop VZEROUPPER mulAvxGFNI_3x4Xor_end: RET // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X4 VPBROADCASTB X4, Y4 mulAvxTwo_3x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (R8), Y0 VMOVDQU (CX), Y5 
VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (DI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x4Xor_loop VZEROUPPER mulAvxTwo_3x4Xor_end: RET // func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X5 VPBROADCASTB X5, Y5 mulAvxTwo_3x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, 
Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x5_loop VZEROUPPER mulAvxTwo_3x5_end: RET // func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, CX mulGFNI_3x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z20, Z15 VGF2P8AFFINEQB $0x00, Z1, Z20, Z16 VGF2P8AFFINEQB $0x00, Z2, Z20, Z17 VGF2P8AFFINEQB $0x00, Z3, Z20, Z18 VGF2P8AFFINEQB $0x00, Z4, Z20, Z19 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (BX), Z20 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z15, Z21, Z15 VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 VXORPD Z15, Z21, Z15 VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 5 outputs VMOVDQU64 Z15, (DI) ADDQ 
$0x40, DI VMOVDQU64 Z16, (R8) ADDQ $0x40, R8 VMOVDQU64 Z17, (R9) ADDQ $0x40, R9 VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x5_64_loop VZEROUPPER mulGFNI_3x5_64_end: RET // func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x5(SB), $0-88 // Loading 9 of 15 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DX mulAvxGFNI_3x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R8) ADDQ $0x20, R8 VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R10) ADDQ $0x20, R10 VMOVDQU Y12, (R11) ADDQ $0x20, R11 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x5_loop VZEROUPPER mulAvxGFNI_3x5_end: RET // func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ 
(SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, CX mulGFNI_3x5_64Xor_loop: // Load 5 outputs VMOVDQU64 (DI), Z15 VMOVDQU64 (R8), Z16 VMOVDQU64 (R9), Z17 VMOVDQU64 (R10), Z18 VMOVDQU64 (SI), Z19 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 VXORPD Z15, Z21, Z15 VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (BX), Z20 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z15, Z21, Z15 VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 VXORPD Z15, Z21, Z15 VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 5 outputs VMOVDQU64 Z15, (DI) ADDQ $0x40, DI VMOVDQU64 Z16, (R8) ADDQ $0x40, R8 VMOVDQU64 Z17, (R9) ADDQ $0x40, R9 VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x5_64Xor_loop VZEROUPPER mulGFNI_3x5_64Xor_end: RET // func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 // Loading 9 of 15 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DX mulAvxGFNI_3x5Xor_loop: // Load 5 outputs VMOVDQU (R8), Y9 VMOVDQU (R9), Y10 VMOVDQU (R10), Y11 VMOVDQU (R11), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, 
Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R8) ADDQ $0x20, R8 VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R10) ADDQ $0x20, R10 VMOVDQU Y12, (R11) ADDQ $0x20, R11 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x5Xor_loop VZEROUPPER mulAvxGFNI_3x5Xor_end: RET // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X5 VPBROADCASTB X5, Y5 mulAvxTwo_3x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (R8), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (DI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, 
Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x5Xor_loop VZEROUPPER mulAvxTwo_3x5Xor_end: RET // func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X6 VPBROADCASTB X6, Y6 mulAvxTwo_3x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( 
$0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x6_loop VZEROUPPER mulAvxTwo_3x6_end: RET // func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, CX mulGFNI_3x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z24 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z24, Z18 VGF2P8AFFINEQB $0x00, Z1, Z24, Z19 VGF2P8AFFINEQB $0x00, Z2, Z24, Z20 VGF2P8AFFINEQB $0x00, Z3, Z24, Z21 VGF2P8AFFINEQB $0x00, Z4, Z24, Z22 VGF2P8AFFINEQB $0x00, Z5, Z24, Z23 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z24 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 VXORPD Z23, Z25, Z23 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (CX), Z24 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 VXORPD Z23, Z25, Z23 // Store 6 outputs VMOVDQU64 Z18, (DI) ADDQ $0x40, DI VMOVDQU64 Z19, (R8) ADDQ $0x40, R8 VMOVDQU64 Z20, (R9) ADDQ $0x40, R9 VMOVDQU64 Z21, (R10) ADDQ $0x40, R10 VMOVDQU64 Z22, (R11) ADDQ $0x40, R11 VMOVDQU64 Z23, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x6_64_loop VZEROUPPER mulGFNI_3x6_64_end: RET // func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x6(SB), $0-88 // Loading 8 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ 
mulAvxGFNI_3x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DX mulAvxGFNI_3x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x6_loop VZEROUPPER mulAvxGFNI_3x6_end: RET // func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, 
R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, CX mulGFNI_3x6_64Xor_loop: // Load 6 outputs VMOVDQU64 (DI), Z18 VMOVDQU64 (R8), Z19 VMOVDQU64 (R9), Z20 VMOVDQU64 (R10), Z21 VMOVDQU64 (R11), Z22 VMOVDQU64 (SI), Z23 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z24 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 VXORPD Z23, Z25, Z23 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z24 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 VXORPD Z23, Z25, Z23 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (CX), Z24 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 VXORPD Z18, Z25, Z18 VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 VXORPD Z19, Z25, Z19 VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 VXORPD Z20, Z25, Z20 VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 VXORPD Z21, Z25, Z21 VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 VXORPD Z22, Z25, Z22 VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 VXORPD Z23, Z25, Z23 // Store 6 outputs VMOVDQU64 Z18, (DI) ADDQ $0x40, DI VMOVDQU64 Z19, (R8) ADDQ $0x40, R8 VMOVDQU64 Z20, (R9) ADDQ $0x40, R9 VMOVDQU64 Z21, (R10) ADDQ $0x40, R10 VMOVDQU64 Z22, (R11) ADDQ $0x40, R11 VMOVDQU64 Z23, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x6_64Xor_loop VZEROUPPER mulGFNI_3x6_64Xor_end: RET // func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 // Loading 8 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DX mulAvxGFNI_3x6Xor_loop: // Load 6 outputs VMOVDQU (R8), Y8 VMOVDQU (R9), Y9 VMOVDQU (R10), Y10 VMOVDQU (R11), Y11 VMOVDQU (R12), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, 
Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x6Xor_loop VZEROUPPER mulAvxGFNI_3x6Xor_end: RET // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X6 VPBROADCASTB X6, Y6 mulAvxTwo_3x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (R8), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (DI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, 
Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x6Xor_loop VZEROUPPER mulAvxTwo_3x6Xor_end: RET // func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 54 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), DI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DI // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 mulAvxTwo_3x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 
640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x7_loop VZEROUPPER mulAvxTwo_3x7_end: RET // func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, CX mulGFNI_3x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (DX), Z28 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z28, Z21 VGF2P8AFFINEQB $0x00, Z1, Z28, Z22 VGF2P8AFFINEQB $0x00, Z2, Z28, Z23 VGF2P8AFFINEQB $0x00, Z3, Z28, Z24 VGF2P8AFFINEQB $0x00, Z4, Z28, Z25 VGF2P8AFFINEQB $0x00, Z5, Z28, Z26 VGF2P8AFFINEQB $0x00, Z6, Z28, Z27 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (BX), Z28 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z7, Z28, 
Z29 VXORPD Z21, Z29, Z21 VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 VXORPD Z22, Z29, Z22 VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 VXORPD Z23, Z29, Z23 VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (CX), Z28 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 VXORPD Z21, Z29, Z21 VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 VXORPD Z22, Z29, Z22 VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 VXORPD Z23, Z29, Z23 VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 VXORPD Z27, Z29, Z27 // Store 7 outputs VMOVDQU64 Z21, (DI) ADDQ $0x40, DI VMOVDQU64 Z22, (R8) ADDQ $0x40, R8 VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x7_64_loop VZEROUPPER mulGFNI_3x7_64_end: RET // func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x7(SB), $0-88 // Loading 7 of 21 tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), DI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DI // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DX mulAvxGFNI_3x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 
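	// Background note: each VGF2P8AFFINEQB in this kernel multiplies every
	// input byte by one fixed GF(2^8) matrix coefficient, encoded as the 8x8
	// bit-matrix held in the broadcast 64-bit table; with imm8 = $0x00 no
	// affine constant is added, so it is a pure GF(2) matrix-vector product
	// per byte. A scalar Go sketch of one byte lane follows; affineByte and
	// gfMatrix are illustrative names, not part of this package:
	//
	//	import "math/bits"
	//
	//	func affineByte(gfMatrix uint64, b byte) byte {
	//	    var out byte
	//	    for i := 0; i < 8; i++ {
	//	        row := byte(gfMatrix >> (8 * (7 - i))) // matrix byte 7-i drives result bit i
	//	        if bits.OnesCount8(row&b)&1 == 1 {
	//	            out |= 1 << i
	//	        }
	//	    }
	//	    return out
	//	}
	//
	// The VXORPD after each product folds it into that output's running
	// GF(2^8) sum; tables beyond the register-resident ones are re-broadcast
	// from CX inside the loop, as above.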
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R8) ADDQ $0x20, R8 VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x7_loop VZEROUPPER mulAvxGFNI_3x7_end: RET // func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, CX mulGFNI_3x7_64Xor_loop: // Load 7 outputs VMOVDQU64 (DI), Z21 VMOVDQU64 (R8), Z22 VMOVDQU64 (R9), Z23 VMOVDQU64 (R10), Z24 VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (SI), Z27 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (DX), Z28 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 VXORPD Z21, Z29, Z21 VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 VXORPD Z22, Z29, Z22 VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 VXORPD Z23, Z29, Z23 VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (BX), Z28 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 VXORPD Z21, Z29, Z21 VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 VXORPD Z22, Z29, Z22 VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 VXORPD Z23, Z29, Z23 VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (CX), Z28 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 VXORPD Z21, Z29, Z21 VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 VXORPD Z22, Z29, Z22 VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 VXORPD Z23, Z29, Z23 VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, 
Z18, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 VXORPD Z27, Z29, Z27 // Store 7 outputs VMOVDQU64 Z21, (DI) ADDQ $0x40, DI VMOVDQU64 Z22, (R8) ADDQ $0x40, R8 VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ AX JNZ mulGFNI_3x7_64Xor_loop VZEROUPPER mulGFNI_3x7_64Xor_end: RET // func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 // Loading 7 of 21 tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), DI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DI // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DX mulAvxGFNI_3x7Xor_loop: // Load 7 outputs VMOVDQU (R8), Y7 VMOVDQU (R9), Y8 VMOVDQU (R10), Y9 VMOVDQU (R11), Y10 VMOVDQU (R12), Y11 VMOVDQU (R13), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R8) ADDQ $0x20, R8 VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, 
(R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x7Xor_loop VZEROUPPER mulAvxGFNI_3x7Xor_end: RET // func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 54 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), DI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DI // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 mulAvxTwo_3x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (R8), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (DI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) 
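	// Background note: the mulAvxTwo_* kernels use the split-nibble PSHUFB
	// method: the VPSRLQ/VPAND pair above splits each source byte into its low
	// and high nibble, each nibble indexes a 16-entry lookup table via
	// VPSHUFB, and the two partial products XOR together into the full
	// GF(2^8) product. A scalar Go sketch of one byte; mulByteNibbles is an
	// illustrative name, and lo/hi stand for the 16-byte tables that each
	// 32-byte load from CX carries duplicated across both lanes:
	//
	//	func mulByteNibbles(lo, hi *[16]byte, b byte) byte {
	//	    return lo[b&0x0f] ^ hi[b>>4]
	//	}
	//
	// The Xor variants, such as this one, first load the existing output
	// block and accumulate into it instead of overwriting it.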
VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x7Xor_loop VZEROUPPER mulAvxTwo_3x7Xor_end: RET // func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 61 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 mulAvxTwo_3x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 
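	// Background note: every kernel here shifts n right by its block-size
	// exponent (5 for the 32 bytes/loop forms, 6 for the 64 bytes/loop
	// forms), so only whole blocks are processed and any shorter tail is left
	// to the caller. A minimal calling sketch, assuming equal-length shards
	// and ignoring CPU-feature dispatch (illustrative only):
	//
	//	// matrix holds the 3x8 lookup tables as produced by the package's
	//	// table generation; in has 3 shards, out has 8.
	//	n := len(in[0]) &^ 31 // round down to the 32-byte loop granularity
	//	mulAvxTwo_3x8(matrix, in, out, 0, n)
	//
	// start and n are byte offsets applied to every shard, which lets callers
	// split one multiply across goroutines by range.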
XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (R14) ADDQ $0x20, R14 VMOVDQU Y7, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x8_loop VZEROUPPER mulAvxTwo_3x8_end: RET // func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64(SB), $0-88 // Loading 22 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX mulGFNI_3x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, 
Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs VMOVDQU64 Z22, (R8) ADDQ $0x40, R8 VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_3x8_64_loop VZEROUPPER mulGFNI_3x8_64_end: RET // func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x8(SB), $0-88 // Loading 6 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX mulAvxGFNI_3x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and 
process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (R8) ADDQ $0x20, R8 VMOVDQU Y7, (R9) ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x8_loop VZEROUPPER mulAvxGFNI_3x8_end: RET // func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 // Loading 22 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX mulGFNI_3x8_64Xor_loop: // Load 8 outputs VMOVDQU64 (R8), Z22 VMOVDQU64 (R9), Z23 VMOVDQU64 (R10), Z24 VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (DI), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB 
$0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs VMOVDQU64 Z22, (R8) ADDQ $0x40, R8 VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_3x8_64Xor_loop VZEROUPPER mulGFNI_3x8_64Xor_end: RET // func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 // Loading 6 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX mulAvxGFNI_3x8Xor_loop: // Load 8 outputs VMOVDQU (R8), Y6 VMOVDQU (R9), Y7 VMOVDQU (R10), Y8 VMOVDQU (R11), Y9 VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 
104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (R8) ADDQ $0x20, R8 VMOVDQU Y7, (R9) ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x8Xor_loop VZEROUPPER mulAvxGFNI_3x8Xor_end: RET // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 61 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 mulAvxTwo_3x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (R8), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (DI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI 
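	// Editor's note (hedged): what follows is the standard AVX2 nibble-table
	// technique for GF(2^8) multiplication. Each input byte is split into its
	// low and high nibble (VPSRLQ plus VPAND against the 0x0f mask broadcast
	// in Y8); each nibble selects from a 16-entry VPSHUFB table (a 32-byte
	// low/high pair per input/output combination at CX), and the two lookups
	// are XORed into the output accumulators Y0-Y7.
	// Rough scalar sketch in Go, with illustrative table names (not this
	// package's identifiers):
	//
	//	// gfMul(c, x) == mulLo[c][x&0x0f] ^ mulHi[c][x>>4] for each byte x
	//	out[i] ^= mulLo[c][in[i]&0x0f] ^ mulHi[c][in[i]>>4]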
	VPSRLQ  $0x04, Y11, Y12
	VPAND   Y8, Y11, Y11
	VPAND   Y8, Y12, Y12
	VMOVDQU 512(CX), Y9
	VMOVDQU 544(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 576(CX), Y9
	VMOVDQU 608(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 640(CX), Y9
	VMOVDQU 672(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 704(CX), Y9
	VMOVDQU 736(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 768(CX), Y9
	VMOVDQU 800(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 832(CX), Y9
	VMOVDQU 864(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 896(CX), Y9
	VMOVDQU 928(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 960(CX), Y9
	VMOVDQU 992(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (DX), Y11
	ADDQ    $0x20, DX
	VPSRLQ  $0x04, Y11, Y12
	VPAND   Y8, Y11, Y11
	VPAND   Y8, Y12, Y12
	VMOVDQU 1024(CX), Y9
	VMOVDQU 1056(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 1088(CX), Y9
	VMOVDQU 1120(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1152(CX), Y9
	VMOVDQU 1184(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 1216(CX), Y9
	VMOVDQU 1248(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1280(CX), Y9
	VMOVDQU 1312(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 1344(CX), Y9
	VMOVDQU 1376(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 1408(CX), Y9
	VMOVDQU 1440(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 1472(CX), Y9
	VMOVDQU 1504(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Store 8 outputs
	VMOVDQU Y0, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y1, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y2, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y3, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y4, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y5, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y6, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y7, (DI)
	ADDQ    $0x20, DI

	// Prepare for next loop
	DECQ AX
	JNZ  mulAvxTwo_3x8Xor_loop
	VZEROUPPER

mulAvxTwo_3x8Xor_end:
	RET

// func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 68 YMM used
	MOVQ  n+80(FP), AX
	MOVQ  matrix_base+0(FP), CX
	SHRQ  $0x05, AX
	TESTQ AX, AX
	JZ    mulAvxTwo_3x9_end
	MOVQ  in_base+24(FP), DX
	MOVQ  (DX), BX
	MOVQ  24(DX), SI
	MOVQ  48(DX), DX
	MOVQ  out_base+48(FP), DI
	MOVQ  (DI), R8
	MOVQ  24(DI), R9
	MOVQ  48(DI), R10
	MOVQ  72(DI), R11
	MOVQ  96(DI), R12
	MOVQ  120(DI), R13
	MOVQ  144(DI), R14
	MOVQ  168(DI), R15
	MOVQ  192(DI), DI
	MOVQ  start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ         BP, BX
	ADDQ         BP, SI
	ADDQ         BP, DX
	MOVQ         $0x0000000f, BP
	MOVQ         BP, X9
	VPBROADCASTB X9, Y9

mulAvxTwo_3x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y12
	ADDQ    $0x20, BX
	VPSRLQ  $0x04, Y12, Y13
	VPAND   Y9, Y12, Y12
	VPAND   Y9, Y13, Y13
	VMOVDQU (CX), Y10
	VMOVDQU 32(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
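	// Editor's note (hedged): in this non-Xor kernel the first input's pair
	// of lookups initializes each accumulator directly (the VPXOR into Y0
	// below, and likewise Y1-Y8); the matching *Xor kernel instead loads the
	// existing output bytes first and accumulates into them.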
	VPXOR   Y10, Y11, Y0
	VMOVDQU 64(CX), Y10
	VMOVDQU 96(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y1
	VMOVDQU 128(CX), Y10
	VMOVDQU 160(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y2
	VMOVDQU 192(CX), Y10
	VMOVDQU 224(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y3
	VMOVDQU 256(CX), Y10
	VMOVDQU 288(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y4
	VMOVDQU 320(CX), Y10
	VMOVDQU 352(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y5
	VMOVDQU 384(CX), Y10
	VMOVDQU 416(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y6
	VMOVDQU 448(CX), Y10
	VMOVDQU 480(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y7
	VMOVDQU 512(CX), Y10
	VMOVDQU 544(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR   Y10, Y11, Y8

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y12
	ADDQ    $0x20, SI
	VPSRLQ  $0x04, Y12, Y13
	VPAND   Y9, Y12, Y12
	VPAND   Y9, Y13, Y13
	VMOVDQU 576(CX), Y10
	VMOVDQU 608(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU 640(CX), Y10
	VMOVDQU 672(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU 704(CX), Y10
	VMOVDQU 736(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU 768(CX), Y10
	VMOVDQU 800(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU 832(CX), Y10
	VMOVDQU 864(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU 896(CX), Y10
	VMOVDQU 928(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y5)
	VMOVDQU 960(CX), Y10
	VMOVDQU 992(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y6)
	VMOVDQU 1024(CX), Y10
	VMOVDQU 1056(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y7)
	VMOVDQU 1088(CX), Y10
	VMOVDQU 1120(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y8)

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DX), Y12
	ADDQ    $0x20, DX
	VPSRLQ  $0x04, Y12, Y13
	VPAND   Y9, Y12, Y12
	VPAND   Y9, Y13, Y13
	VMOVDQU 1152(CX), Y10
	VMOVDQU 1184(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU 1216(CX), Y10
	VMOVDQU 1248(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU 1280(CX), Y10
	VMOVDQU 1312(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU 1344(CX), Y10
	VMOVDQU 1376(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU 1408(CX), Y10
	VMOVDQU 1440(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU 1472(CX), Y10
	VMOVDQU 1504(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y5)
	VMOVDQU 1536(CX), Y10
	VMOVDQU 1568(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y6)
	VMOVDQU 1600(CX), Y10
	VMOVDQU 1632(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y7)
	VMOVDQU 1664(CX), Y10
	VMOVDQU 1696(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y8)

	// Store 9 outputs
	VMOVDQU Y0, (R8)
	ADDQ    $0x20, R8
	VMOVDQU Y1, (R9)
	ADDQ    $0x20, R9
	VMOVDQU Y2, (R10)
	ADDQ    $0x20, R10
	VMOVDQU Y3, (R11)
	ADDQ    $0x20, R11
	VMOVDQU Y4, (R12)
	ADDQ    $0x20, R12
	VMOVDQU Y5, (R13)
	ADDQ    $0x20, R13
	VMOVDQU Y6, (R14)
	ADDQ    $0x20, R14
	VMOVDQU Y7, (R15)
	ADDQ    $0x20, R15
VMOVDQU Y8, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x9_loop VZEROUPPER mulAvxTwo_3x9_end: RET // func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64(SB), $8-88 // Loading 21 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DX mulGFNI_3x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs VMOVDQU64 Z21, (R8) ADDQ $0x40, R8 VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, 
(R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_3x9_64_loop VZEROUPPER mulGFNI_3x9_64_end: RET // func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x9(SB), $8-88 // Loading 5 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DX mulAvxGFNI_3x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU 
Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x9_loop VZEROUPPER mulAvxGFNI_3x9_end: RET // func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 // Loading 21 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DX mulGFNI_3x9_64Xor_loop: // Load 9 outputs VMOVDQU64 (R8), Z21 VMOVDQU64 (R9), Z22 VMOVDQU64 (R10), Z23 VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (DI), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD 
Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs VMOVDQU64 Z21, (R8) ADDQ $0x40, R8 VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_3x9_64Xor_loop VZEROUPPER mulGFNI_3x9_64Xor_end: RET // func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 // Loading 5 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DX mulAvxGFNI_3x9Xor_loop: // Load 9 outputs VMOVDQU (R8), Y5 VMOVDQU (R9), Y6 VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_3x9Xor_loop VZEROUPPER mulAvxGFNI_3x9Xor_end: RET // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 mulAvxTwo_3x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (R8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 
672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (R14) ADDQ $0x20, R14 VMOVDQU Y7, (R15) ADDQ $0x20, R15 VMOVDQU Y8, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x9Xor_loop VZEROUPPER mulAvxTwo_3x9Xor_end: RET // func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 75 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x10_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_3x10_loop: // Load and process 32 bytes 
from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 
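	// Editor's note (hedged): the lookup tables appear to be laid out
	// input-major, so for this 3x10 kernel the 64-byte low/high pair for
	// (input i, output o) starts at 64*(i*10+o)(CX); that is why input 2's
	// tables run from 1280(CX) upward.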
XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (R15) ADDQ $0x20, R15 VMOVDQU Y9, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ BP JNZ mulAvxTwo_3x10_loop VZEROUPPER mulAvxTwo_3x10_end: RET // func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64(SB), $8-88 // Loading 20 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_3x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), 
Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs VMOVDQU64 Z20, (DI) ADDQ $0x40, DI VMOVDQU64 Z21, (R8) ADDQ $0x40, R8 VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ BP JNZ mulGFNI_3x10_64_loop VZEROUPPER mulGFNI_3x10_64_end: RET // func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x10(SB), $8-88 // Loading 4 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_3x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs VMOVDQU Y4, (DI) ADDQ $0x20, DI VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ BP JNZ mulAvxGFNI_3x10_loop VZEROUPPER mulAvxGFNI_3x10_end: RET // func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 // Loading 20 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_3x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_3x10_64Xor_loop: // Load 10 outputs VMOVDQU64 (DI), Z20 VMOVDQU64 (R8), Z21 VMOVDQU64 (R9), Z22 VMOVDQU64 (R10), Z23 VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (SI), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, 
Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs VMOVDQU64 Z20, (DI) ADDQ $0x40, DI VMOVDQU64 Z21, (R8) ADDQ $0x40, R8 VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (SI) ADDQ $0x40, SI // Prepare for next loop DECQ BP JNZ mulGFNI_3x10_64Xor_loop VZEROUPPER mulGFNI_3x10_64Xor_end: RET // func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 // Loading 4 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_3x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_3x10Xor_loop: // Load 10 outputs VMOVDQU (DI), Y4 VMOVDQU (R8), Y5 VMOVDQU (R9), Y6 VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (SI), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX 
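	// Editor's note (hedged): VGF2P8AFFINEQB computes, for every byte x of
	// the input register, the GF(2)-affine map A*x + b; A is the 8x8 bit
	// matrix replicated in each 64-bit lane of the table register (Y0-Y3 or
	// the VBROADCASTSD loads below) and b is the immediate, 0 here. Since
	// multiplying by a fixed GF(2^8) coefficient is linear over GF(2), one
	// 64-bit matrix per coefficient suffices. Rough bit-level sketch in Go
	// (illustrative only; lane bit ordering is glossed over):
	//
	//	var y byte
	//	for j := 0; j < 8; j++ { // y's bit j = parity of (row j of A) AND x
	//		y |= byte(bits.OnesCount8(a[j]&x)&1) << j
	//	}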
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs VMOVDQU Y4, (DI) ADDQ $0x20, DI VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ BP JNZ mulAvxGFNI_3x10Xor_loop VZEROUPPER mulAvxGFNI_3x10Xor_end: RET // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 75 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x10Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 
144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_3x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (DI), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, 
Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (R15) ADDQ $0x20, R15 VMOVDQU Y9, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ BP JNZ mulAvxTwo_3x10Xor_loop VZEROUPPER mulAvxTwo_3x10Xor_end: RET // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X2 VPBROADCASTB X2, Y2 mulAvxTwo_4x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 
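	// Y6/Y5 now hold the low nibbles and Y7/Y8 the high nibbles of the
	// 64 input bytes. Each 32-byte table loaded below is a 16-entry
	// lookup replicated across both 16-byte lanes (VPSHUFB indexes
	// within each lane); XORing the low- and high-nibble lookups yields
	// the GF(2^8) products for this input/output pair.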
VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x1_64_loop VZEROUPPER mulAvxTwo_4x1_64_end: RET // func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 7 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, CX mulGFNI_4x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z5 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z5, Z4 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z5 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 VXORPD Z4, Z5, Z4 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z5 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 VXORPD Z4, Z5, Z4 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (CX), Z5 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 VXORPD Z4, Z5, Z4 // Store 1 outputs VMOVDQU64 Z4, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x1_64_loop VZEROUPPER mulGFNI_4x1_64_end: RET // func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 7 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, CX mulAvxGFNI_4x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y5 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 VXORPD Y4, Y5, Y4 // Store 1 outputs VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x1_loop VZEROUPPER mulAvxGFNI_4x1_end: RET // func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 7 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX 
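	// The ...Xor kernels below follow the same layout as their plain
	// counterparts but load the existing output block each iteration and
	// XOR the new products into it, so partial results accumulate
	// instead of overwriting the destination.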
SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, CX mulGFNI_4x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (DI), Z4 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z5 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z5, Z5 VXORPD Z4, Z5, Z4 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z5 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 VXORPD Z4, Z5, Z4 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z5 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 VXORPD Z4, Z5, Z4 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (CX), Z5 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 VXORPD Z4, Z5, Z4 // Store 1 outputs VMOVDQU64 Z4, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x1_64Xor_loop VZEROUPPER mulGFNI_4x1_64Xor_end: RET // func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 7 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, CX mulAvxGFNI_4x1Xor_loop: // Load 1 outputs VMOVDQU (DI), Y4 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y5 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 VXORPD Y4, Y5, Y4 // Store 1 outputs VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x1Xor_loop VZEROUPPER mulAvxGFNI_4x1Xor_end: RET // func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X2 VPBROADCASTB X2, Y2 mulAvxTwo_4x1_64Xor_loop: // Load 1 outputs VMOVDQU (R8), Y0 VMOVDQU 32(R8), Y1 // Load and process 64 bytes 
from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x1_64Xor_loop VZEROUPPER mulAvxTwo_4x1_64Xor_end: RET // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 ADDQ R10, R8 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X4 VPBROADCASTB X4, Y4 mulAvxTwo_4x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( 
$0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) ADDQ $0x40, R9 VMOVDQU Y2, (R8) VMOVDQU Y3, 32(R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x2_64_loop VZEROUPPER mulAvxTwo_4x2_64_end: RET // func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, CX mulGFNI_4x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z10 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z10, Z8 VGF2P8AFFINEQB $0x00, Z1, Z10, Z9 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z10 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 VXORPD Z9, Z11, Z9 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z10 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 VXORPD Z9, Z11, Z9 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (CX), Z10 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 VXORPD Z9, Z11, Z9 // Store 2 outputs VMOVDQU64 Z8, (R8) ADDQ $0x40, R8 VMOVDQU64 Z9, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x2_64_loop VZEROUPPER mulGFNI_4x2_64_end: RET // func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x2(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ 
AX, AX JZ mulAvxGFNI_4x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, CX mulAvxGFNI_4x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 VXORPD Y9, Y11, Y9 // Store 2 outputs VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x2_loop VZEROUPPER mulAvxGFNI_4x2_end: RET // func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, CX mulGFNI_4x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (R8), Z8 VMOVDQU64 (DI), Z9 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z10 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 VXORPD Z9, Z11, Z9 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z10 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 VXORPD Z9, Z11, Z9 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z10 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 VXORPD Z9, Z11, Z9 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (CX), Z10 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 VXORPD Z8, Z11, Z8 VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 VXORPD Z9, Z11, Z9 // Store 2 outputs VMOVDQU64 Z8, (R8) ADDQ $0x40, R8 VMOVDQU64 Z9, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x2_64Xor_loop VZEROUPPER mulGFNI_4x2_64Xor_end: RET // func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI 
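// As a rough scalar model of what the GFNI Xor kernels compute (the loop
// order differs from the vectorized code, but XOR is associative, so the
// result is the same). gf2p8affine is a hypothetical stand-in for
// VGF2P8AFFINEQB with imm8 = 0 and is not a helper in this package:
//
//	func mulGFNIXorRef(matrix []uint64, in, out [][]byte) {
//		for i := range in {
//			for j := range out {
//				// One 8x8 GF(2) bit matrix per (input, output) pair.
//				m := matrix[i*len(out)+j]
//				for k, b := range in[i] {
//					out[j][k] ^= gf2p8affine(m, b) // accumulate
//				}
//			}
//		}
//	}
//
// The non-Xor kernels are identical except that input 0 overwrites the
// outputs rather than XORing into them.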
TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, CX mulAvxGFNI_4x2Xor_loop: // Load 2 outputs VMOVDQU (R8), Y8 VMOVDQU (DI), Y9 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 VXORPD Y8, Y11, Y8 VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 VXORPD Y9, Y11, Y9 // Store 2 outputs VMOVDQU Y8, (R8) ADDQ $0x20, R8 VMOVDQU Y9, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x2Xor_loop VZEROUPPER mulAvxGFNI_4x2Xor_end: RET // func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 ADDQ R10, R8 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X4 VPBROADCASTB X4, Y4 mulAvxTwo_4x2_64Xor_loop: // Load 2 outputs VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 VMOVDQU (R8), Y2 VMOVDQU 32(R8), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 
VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) ADDQ $0x40, R9 VMOVDQU Y2, (R8) VMOVDQU Y3, 32(R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x2_64Xor_loop VZEROUPPER mulAvxTwo_4x2_64Xor_end: RET // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, R8 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X6 VPBROADCASTB X6, Y6 mulAvxTwo_4x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) 
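	// In the _64 kernels each output block spans two YMM registers
	// (bytes 0-31 and 32-63), so the XOR3WAY updates come in pairs per
	// output, as above for output 0.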
VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) ADDQ $0x40, R9 VMOVDQU Y2, (R10) VMOVDQU Y3, 32(R10) ADDQ $0x40, R10 VMOVDQU Y4, (R8) VMOVDQU Y5, 32(R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x3_64_loop VZEROUPPER mulAvxTwo_4x3_64_end: RET // func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, DI // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, CX mulGFNI_4x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z15 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z15, Z12 VGF2P8AFFINEQB $0x00, Z1, Z15, Z13 VGF2P8AFFINEQB $0x00, Z2, Z15, Z14 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z15 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 VXORPD Z12, 
Z16, Z12 VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 VXORPD Z14, Z16, Z14 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z15 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 VXORPD Z14, Z16, Z14 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (CX), Z15 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 VXORPD Z14, Z16, Z14 // Store 3 outputs VMOVDQU64 Z12, (R8) ADDQ $0x40, R8 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 VMOVDQU64 Z14, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x3_64_loop VZEROUPPER mulGFNI_4x3_64_end: RET // func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x3(SB), $0-88 // Loading 11 of 12 tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, R8 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, DX mulAvxGFNI_4x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R9) ADDQ $0x20, R9 VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x3_loop VZEROUPPER mulAvxGFNI_4x3_end: RET // func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 
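	// Each VBROADCASTF32X2 replicates one 8-byte GF(2) affine matrix
	// across all eight 64-bit lanes of a ZMM register; a 4-input,
	// 3-output kernel needs 4*3 = 12 matrices (Z0-Z11), leaving the
	// rest of the register file for accumulators, input and scratch.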
VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, DI // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, CX mulGFNI_4x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (R8), Z12 VMOVDQU64 (R9), Z13 VMOVDQU64 (DI), Z14 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z15 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 VXORPD Z14, Z16, Z14 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z15 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 VXORPD Z14, Z16, Z14 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z15 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 VXORPD Z14, Z16, Z14 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (CX), Z15 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 VXORPD Z12, Z16, Z12 VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 VXORPD Z13, Z16, Z13 VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 VXORPD Z14, Z16, Z14 // Store 3 outputs VMOVDQU64 Z12, (R8) ADDQ $0x40, R8 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 VMOVDQU64 Z14, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x3_64Xor_loop VZEROUPPER mulGFNI_4x3_64Xor_end: RET // func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 // Loading 11 of 12 tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, R8 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, DX mulAvxGFNI_4x3Xor_loop: // Load 3 outputs VMOVDQU (R9), Y11 VMOVDQU (R10), Y12 VMOVDQU (R8), Y13 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU 
(DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R9) ADDQ $0x20, R9 VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x3Xor_loop VZEROUPPER mulAvxGFNI_4x3Xor_end: RET // func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, R8 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X6 VPBROADCASTB X6, Y6 mulAvxTwo_4x3_64Xor_loop: // Load 3 outputs VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 VMOVDQU (R10), Y2 VMOVDQU 32(R10), Y3 VMOVDQU (R8), Y4 VMOVDQU 32(R8), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( 
$0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) ADDQ $0x40, R9 VMOVDQU Y2, (R10) VMOVDQU Y3, 32(R10) ADDQ $0x40, R10 VMOVDQU Y4, (R8) VMOVDQU Y5, 32(R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x3_64Xor_loop VZEROUPPER mulAvxTwo_4x3_64Xor_end: RET // func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R8 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R8 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X4 VPBROADCASTB X4, Y4 mulAvxTwo_4x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, 
Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x4_loop VZEROUPPER mulAvxTwo_4x4_end: RET // func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, CX mulGFNI_4x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z20, Z16 VGF2P8AFFINEQB $0x00, Z1, Z20, Z17 VGF2P8AFFINEQB $0x00, Z2, Z20, Z18 VGF2P8AFFINEQB $0x00, Z3, Z20, Z19 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (BX), Z20 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (SI), Z20 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 4 outputs VMOVDQU64 Z16, 
(R8) ADDQ $0x40, R8 VMOVDQU64 Z17, (R9) ADDQ $0x40, R9 VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x4_64_loop VZEROUPPER mulGFNI_4x4_64_end: RET // func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x4(SB), $0-88 // Loading 10 of 16 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R8 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R8 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, DX mulAvxGFNI_4x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R10) ADDQ $0x20, R10 VMOVDQU Y12, (R11) ADDQ $0x20, R11 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x4_loop VZEROUPPER mulAvxGFNI_4x4_end: RET // func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x4_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 
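	// Contrast with mulAvxGFNI_4x4 above: limited to 16 YMM registers,
	// that kernel keeps only 10 of its 16 matrices resident and
	// re-broadcasts the other six inside the loop, whereas the ZMM
	// variants hold all 16 matrices in Z0-Z15 for the entire run.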
VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, CX mulGFNI_4x4_64Xor_loop: // Load 4 outputs VMOVDQU64 (R8), Z16 VMOVDQU64 (R9), Z17 VMOVDQU64 (R10), Z18 VMOVDQU64 (DI), Z19 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (BX), Z20 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (SI), Z20 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 VXORPD Z16, Z21, Z16 VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 VXORPD Z17, Z21, Z17 VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 4 outputs VMOVDQU64 Z16, (R8) ADDQ $0x40, R8 VMOVDQU64 Z17, (R9) ADDQ $0x40, R9 VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x4_64Xor_loop VZEROUPPER mulGFNI_4x4_64Xor_end: RET // func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 // Loading 10 of 16 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R8 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R8 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, DX mulAvxGFNI_4x4Xor_loop: // Load 4 outputs VMOVDQU (R9), Y10 VMOVDQU (R10), Y11 VMOVDQU (R11), Y12 VMOVDQU (R8), Y13 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 4 
outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R10) ADDQ $0x20, R10 VMOVDQU Y12, (R11) ADDQ $0x20, R11 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x4Xor_loop VZEROUPPER mulAvxGFNI_4x4Xor_end: RET // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R8 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R8 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X4 VPBROADCASTB X4, Y4 mulAvxTwo_4x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (R9), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R8), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( 
$0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x4Xor_loop VZEROUPPER mulAvxTwo_4x4Xor_end: RET // func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R8 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R8 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X5 VPBROADCASTB X5, Y5 mulAvxTwo_4x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 
736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x5_loop VZEROUPPER mulAvxTwo_4x5_end: RET // func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 27 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, CX mulGFNI_4x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z25 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z25, Z20 VGF2P8AFFINEQB $0x00, Z1, Z25, Z21 VGF2P8AFFINEQB $0x00, Z2, Z25, Z22 VGF2P8AFFINEQB $0x00, Z3, Z25, Z23 VGF2P8AFFINEQB $0x00, Z4, Z25, Z24 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (BX), Z25 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 VXORPD Z21, Z26, Z21 VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 VXORPD Z24, Z26, Z24 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (SI), Z25 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 VXORPD Z21, Z26, Z21 
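// The pattern above repeats for every (input, output) pair: VGF2P8AFFINEQB
// applies one broadcast 8x8 bit-matrix (a 64-bit table held in a Z register)
// to every byte of the input vector, and VXORPD folds the product into the
// running output accumulator. A scalar Go sketch of the affine step
// (hypothetical helper, illustration only; it assumes the GF2P8AFFINEQB
// convention that bit i of each result byte is the parity of matrix byte 7-i
// ANDed with the source byte, and uses math/bits.OnesCount8 for parity):
//
//	func gf2p8Affine(matrix uint64, x byte) byte {
//		var out byte
//		for i := 0; i < 8; i++ {
//			row := byte(matrix >> (8 * (7 - i))) // matrix byte 7-i
//			if bits.OnesCount8(row&x)&1 == 1 {   // parity of row AND x
//				out |= 1 << i
//			}
//		}
//		return out
//	}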
VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 VXORPD Z24, Z26, Z24 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (CX), Z25 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 VXORPD Z21, Z26, Z21 VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 VXORPD Z24, Z26, Z24 // Store 5 outputs VMOVDQU64 Z20, (R8) ADDQ $0x40, R8 VMOVDQU64 Z21, (R9) ADDQ $0x40, R9 VMOVDQU64 Z22, (R10) ADDQ $0x40, R10 VMOVDQU64 Z23, (R11) ADDQ $0x40, R11 VMOVDQU64 Z24, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x5_64_loop VZEROUPPER mulGFNI_4x5_64_end: RET // func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x5(SB), $0-88 // Loading 9 of 20 tables to registers // Destination kept in GP registers // Full registers estimated 27 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R8 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R8 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, DX mulAvxGFNI_4x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU 
Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x5_loop VZEROUPPER mulAvxGFNI_4x5_end: RET // func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 27 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, CX mulGFNI_4x5_64Xor_loop: // Load 5 outputs VMOVDQU64 (R8), Z20 VMOVDQU64 (R9), Z21 VMOVDQU64 (R10), Z22 VMOVDQU64 (R11), Z23 VMOVDQU64 (DI), Z24 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z25 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z1, Z25, Z26 VXORPD Z21, Z26, Z21 VGF2P8AFFINEQB $0x00, Z2, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z3, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z4, Z25, Z26 VXORPD Z24, Z26, Z24 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (BX), Z25 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 VXORPD Z21, Z26, Z21 VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 VXORPD Z24, Z26, Z24 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (SI), Z25 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 VXORPD Z21, Z26, Z21 VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 VXORPD Z24, Z26, Z24 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (CX), Z25 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 VXORPD Z20, Z26, Z20 VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 VXORPD Z21, Z26, Z21 VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 VXORPD Z22, Z26, Z22 VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 VXORPD Z23, Z26, Z23 VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 VXORPD Z24, Z26, Z24 // Store 5 outputs VMOVDQU64 Z20, (R8) ADDQ $0x40, R8 VMOVDQU64 Z21, (R9) ADDQ $0x40, R9 VMOVDQU64 Z22, (R10) ADDQ $0x40, R10 VMOVDQU64 Z23, (R11) ADDQ $0x40, R11 VMOVDQU64 Z24, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x5_64Xor_loop VZEROUPPER mulGFNI_4x5_64Xor_end: RET // func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, 
out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 // Loading 9 of 20 tables to registers // Destination kept in GP registers // Full registers estimated 27 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R8 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R8 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, DX mulAvxGFNI_4x5Xor_loop: // Load 5 outputs VMOVDQU (R9), Y9 VMOVDQU (R10), Y10 VMOVDQU (R11), Y11 VMOVDQU (R12), Y12 VMOVDQU (R8), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R9) ADDQ $0x20, R9 VMOVDQU Y10, (R10) ADDQ $0x20, R10 VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x5Xor_loop VZEROUPPER mulAvxGFNI_4x5Xor_end: RET // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 
48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R8 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R8 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X5 VPBROADCASTB X5, Y5 mulAvxTwo_4x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (R9), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R8), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x5Xor_loop VZEROUPPER mulAvxTwo_4x5Xor_end: RET // func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) 
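// mulAvxTwo_4x6, like all mulAvxTwo_* kernels, uses the classic split-table
// technique: each input byte is divided into low and high nibbles, each
// nibble selects from a 16-entry lookup table via VPSHUFB, and the two
// lookups are XORed (VPXOR or the XOR3WAY macro) to form the product with
// one matrix coefficient. A scalar Go sketch of the same idea (hypothetical
// table layout, illustration only):
//
//	// low and high hold the products of one fixed GF(2^8) coefficient
//	// with the 16 possible low-nibble and high-nibble values.
//	func mulByte(low, high *[16]byte, x byte) byte {
//		return low[x&0x0f] ^ high[x>>4]
//	}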
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 59 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R8 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R8 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 mulAvxTwo_4x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, 
Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x6_loop VZEROUPPER mulAvxTwo_4x6_end: RET // func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, CX mulGFNI_4x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD 
Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R8) ADDQ $0x40, R8 VMOVDQU64 Z25, (R9) ADDQ $0x40, R9 VMOVDQU64 Z26, (R10) ADDQ $0x40, R10 VMOVDQU64 Z27, (R11) ADDQ $0x40, R11 VMOVDQU64 Z28, (R12) ADDQ $0x40, R12 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x6_64_loop VZEROUPPER mulGFNI_4x6_64_end: RET // func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x6(SB), $0-88 // Loading 8 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R8 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R8 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, DX mulAvxGFNI_4x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R9) ADDQ $0x20, 
R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x6_loop VZEROUPPER mulAvxGFNI_4x6_end: RET // func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, CX mulGFNI_4x6_64Xor_loop: // Load 6 outputs VMOVDQU64 (R8), Z24 VMOVDQU64 (R9), Z25 VMOVDQU64 (R10), Z26 VMOVDQU64 (R11), Z27 VMOVDQU64 (R12), Z28 VMOVDQU64 (DI), Z29 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, 
Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R8) ADDQ $0x40, R8 VMOVDQU64 Z25, (R9) ADDQ $0x40, R9 VMOVDQU64 Z26, (R10) ADDQ $0x40, R10 VMOVDQU64 Z27, (R11) ADDQ $0x40, R11 VMOVDQU64 Z28, (R12) ADDQ $0x40, R12 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ AX JNZ mulGFNI_4x6_64Xor_loop VZEROUPPER mulGFNI_4x6_64Xor_end: RET // func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 // Loading 8 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R8 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R8 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, DX mulAvxGFNI_4x6Xor_loop: // Load 6 outputs VMOVDQU (R9), Y8 VMOVDQU (R10), Y9 VMOVDQU (R11), Y10 VMOVDQU (R12), Y11 VMOVDQU (R13), Y12 VMOVDQU (R8), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 
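// Only 8 of the 24 coefficient tables fit in YMM registers here (Y0-Y7,
// loaded before the loop); the remaining 16 are re-broadcast from the matrix
// with VBROADCASTSD inside the loop, as in this sequence, trading one 8-byte
// load per coefficient per iteration for reduced register pressure.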
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R9) ADDQ $0x20, R9 VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x6Xor_loop VZEROUPPER mulAvxGFNI_4x6Xor_end: RET // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 59 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R8 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R8 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 mulAvxTwo_4x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (R9), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB 
Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x6Xor_loop VZEROUPPER mulAvxTwo_4x6Xor_end: RET // func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 mulAvxTwo_4x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( 
$0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R14) ADDQ $0x20, R14 VMOVDQU Y6, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x7_loop VZEROUPPER mulAvxTwo_4x7_end: RET // func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64(SB), $0-88 // Loading 23 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 
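// 23 of the 28 coefficient tables are being loaded into Z0-Z22; the last
// five are instead consumed directly from memory in the loop below via
// VGF2P8AFFINEQB.BCST, which broadcasts the 64-bit matrix as part of the
// affine instruction itself and so needs no spare register.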
VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX mulGFNI_4x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_4x7_64_loop VZEROUPPER mulGFNI_4x7_64_end: RET // func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x7(SB), $0-88 // Loading 7 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 
MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX mulAvxGFNI_4x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R9) ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x7_loop VZEROUPPER mulAvxGFNI_4x7_end: RET // func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 // Loading 23 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 
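// The *Xor variants differ from the plain kernels only in the loop body
// prologue: each iteration first loads the current output vectors and
// XOR-accumulates into them, so a caller can fold this matrix product into
// previously computed parity instead of overwriting it.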
VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX mulGFNI_4x7_64Xor_loop: // Load 7 outputs VMOVDQU64 (R9), Z23 VMOVDQU64 (R10), Z24 VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (R8), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs VMOVDQU64 Z23, (R9) ADDQ $0x40, R9 VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_4x7_64Xor_loop VZEROUPPER mulGFNI_4x7_64Xor_end: RET // func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT 
·mulAvxGFNI_4x7Xor(SB), $0-88 // Loading 7 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX mulAvxGFNI_4x7Xor_loop: // Load 7 outputs VMOVDQU (R9), Y7 VMOVDQU (R10), Y8 VMOVDQU (R11), Y9 VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (R8), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R9) 
ADDQ $0x20, R9 VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x7Xor_loop VZEROUPPER mulAvxGFNI_4x7Xor_end: RET // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 mulAvxTwo_4x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (R9), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (R8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 
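// Table layout for the PSHUFB kernels: each (input, output) coefficient
// owns 64 bytes of the matrix buffer, a 32-byte low-nibble table followed
// by a 32-byte high-nibble table, so coefficient (i, j) starts at
// 64*(i*outputs+j)(CX); here input 2, output 2 of this 4x7 kernel sits at
// 1024 = 64*16.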
VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R14) ADDQ $0x20, R14 VMOVDQU Y6, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x7Xor_loop VZEROUPPER mulAvxTwo_4x7Xor_end: RET // func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 77 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 mulAvxTwo_4x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 
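// The multiply itself is the classic nibble split: with lo[x] = c*x and
// hi[x] = c*(x<<4) in GF(2^8) for x = 0..15, c*b == lo[b&15] ^ hi[b>>4],
// which one VPSRLQ/VPAND pair plus two VPSHUFB lookups evaluate for 32
// bytes at a time. A scalar sketch of the same product, assuming the
// 0x11D field polynomial used by this package (mulGF8 is illustrative,
// not an internal API):
//
//	func mulGF8(c, b byte) byte {
//		var p byte
//		for i := 0; i < 8; i++ {
//			if b&1 != 0 {
//				p ^= c // add c where bit i of b is set
//			}
//			carry := c & 0x80
//			c <<= 1
//			if carry != 0 {
//				c ^= 0x1d // reduce modulo x^8+x^4+x^3+x^2+1
//			}
//			b >>= 1
//		}
//		return p
//	}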
VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R14) ADDQ $0x20, R14 VMOVDQU Y6, (R15) ADDQ $0x20, R15 VMOVDQU Y7, 
(R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x8_loop VZEROUPPER mulAvxTwo_4x8_end: RET // func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64(SB), $8-88 // Loading 22 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, DX mulGFNI_4x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, 
Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_4x8_64_loop VZEROUPPER mulGFNI_4x8_64_end: RET // func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x8(SB), $8-88 // Loading 6 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, DX mulAvxGFNI_4x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y14 ADDQ 
$0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x8_loop VZEROUPPER mulAvxGFNI_4x8_end: RET // func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 // Loading 22 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, DX mulGFNI_4x8_64Xor_loop: // Load 8 outputs VMOVDQU64 (R9), Z22 VMOVDQU64 (R10), Z23 VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R8), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 
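// Addition in GF(2^8) is plain XOR, so VXORPD, nominally a floating-point
// instruction, serves as the 512-bit XOR that folds each coefficient
// product into its output accumulator; any full-width bitwise XOR works.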
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_4x8_64Xor_loop VZEROUPPER mulGFNI_4x8_64Xor_end: RET // func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 // Loading 6 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, DX mulAvxGFNI_4x8Xor_loop: // Load 8 outputs VMOVDQU (R9), Y6 VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R8), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, 
Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_4x8Xor_loop VZEROUPPER mulAvxGFNI_4x8Xor_end: RET // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 77 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI 
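// As in the other Xor variants, this kernel reads the existing output
// bytes and XORs the new products into them instead of overwriting, so a
// caller can fold additional input shards into parities computed by an
// earlier pass.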
ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 mulAvxTwo_4x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (R9), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 
1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R14) ADDQ $0x20, R14 VMOVDQU Y6, (R15) ADDQ $0x20, R15 VMOVDQU Y7, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x8Xor_loop VZEROUPPER mulAvxTwo_4x8Xor_end: RET // func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 86 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x9_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_4x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 
VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 ADDQ $0x20, AX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, 
R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (R14) ADDQ $0x20, R14 VMOVDQU Y7, (R15) ADDQ $0x20, R15 VMOVDQU Y8, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ BP JNZ mulAvxTwo_4x9_loop VZEROUPPER mulAvxTwo_4x9_end: RET // func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64(SB), $8-88 // Loading 21 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_4x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs 
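// Only 21 of the 36 coefficient matrices fit in registers; the remainder
// are folded in with VGF2P8AFFINEQB.BCST, which broadcasts the 8-byte
// matrix from memory as part of the multiply itself.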
VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs VMOVDQU64 Z21, (R8) ADDQ $0x40, R8 VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ BP JNZ mulGFNI_4x9_64_loop VZEROUPPER mulGFNI_4x9_64_end: RET // func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x9(SB), $8-88 // Loading 5 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_4x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y14 
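// This AVX-only GFNI path keeps just 5 of the 36 matrices resident
// (Y0-Y4) and re-broadcasts the rest with VBROADCASTSD on every use;
// presumably the point is to serve CPUs that have GFNI and AVX but not
// AVX-512, matching the "Requires: AVX, GFNI" header.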
ADDQ $0x20, SI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ BP JNZ mulAvxGFNI_4x9_loop VZEROUPPER mulAvxGFNI_4x9_end: RET // func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 // Loading 21 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP 
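// With nine outputs every general register is taken: AX was reused for
// the last input pointer, so the loop count is rebuilt from n in BP
// (shifted to 64-byte blocks) once the start offset is no longer needed.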
mulGFNI_4x9_64Xor_loop: // Load 9 outputs VMOVDQU64 (R8), Z21 VMOVDQU64 (R9), Z22 VMOVDQU64 (R10), Z23 VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (DI), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs VMOVDQU64 Z21, (R8) ADDQ $0x40, R8 VMOVDQU64 Z22, (R9) ADDQ $0x40, R9 VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (DI) ADDQ $0x40, DI // Prepare for next loop DECQ BP JNZ mulGFNI_4x9_64Xor_loop VZEROUPPER mulGFNI_4x9_64Xor_end: RET // func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 // Loading 5 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ 
mulAvxGFNI_4x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_4x9Xor_loop: // Load 9 outputs VMOVDQU (R8), Y5 VMOVDQU (R9), Y6 VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (DI), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs VMOVDQU Y5, (R8) ADDQ $0x20, R8 VMOVDQU Y6, (R9) ADDQ $0x20, R9 VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ BP JNZ mulAvxGFNI_4x9Xor_loop VZEROUPPER mulAvxGFNI_4x9Xor_end: RET // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 86 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x9Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_4x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (R8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 
XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 ADDQ $0x20, AX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) 
ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (R14) ADDQ $0x20, R14 VMOVDQU Y7, (R15) ADDQ $0x20, R15 VMOVDQU Y8, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ BP JNZ mulAvxTwo_4x9Xor_loop VZEROUPPER mulAvxTwo_4x9Xor_end: RET // func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 95 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX MOVQ $0x0000000f, R10 MOVQ R10, X10 VPBROADCASTB X10, Y10 mulAvxTwo_4x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 
VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y0, (R10)(R9*1) MOVQ 24(R8), R10 VMOVDQU Y1, (R10)(R9*1) MOVQ 48(R8), R10 VMOVDQU Y2, (R10)(R9*1) MOVQ 72(R8), R10 VMOVDQU Y3, (R10)(R9*1) MOVQ 96(R8), R10 VMOVDQU Y4, (R10)(R9*1) MOVQ 120(R8), R10 VMOVDQU Y5, (R10)(R9*1) MOVQ 144(R8), R10 VMOVDQU Y6, (R10)(R9*1) MOVQ 168(R8), R10 VMOVDQU Y7, (R10)(R9*1) MOVQ 192(R8), R10 VMOVDQU Y8, (R10)(R9*1) MOVQ 216(R8), R10 VMOVDQU Y9, (R10)(R9*1) // Prepare for next loop ADDQ $0x20, R9 DECQ AX JNZ mulAvxTwo_4x10_loop VZEROUPPER mulAvxTwo_4x10_end: RET // func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64(SB), $0-88 // Loading 20 of 40 tables to registers // Destination kept on stack // Full registers estimated 52 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 
VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX mulGFNI_4x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R8), R10 VMOVDQU64 Z20, (R10)(R9*1) MOVQ 24(R8), R10 VMOVDQU64 Z21, (R10)(R9*1) MOVQ 48(R8), R10 VMOVDQU64 Z22, (R10)(R9*1) MOVQ 72(R8), R10 VMOVDQU64 Z23, (R10)(R9*1) MOVQ 96(R8), R10 VMOVDQU64 Z24, (R10)(R9*1) MOVQ 
120(R8), R10 VMOVDQU64 Z25, (R10)(R9*1) MOVQ 144(R8), R10 VMOVDQU64 Z26, (R10)(R9*1) MOVQ 168(R8), R10 VMOVDQU64 Z27, (R10)(R9*1) MOVQ 192(R8), R10 VMOVDQU64 Z28, (R10)(R9*1) MOVQ 216(R8), R10 VMOVDQU64 Z29, (R10)(R9*1) // Prepare for next loop ADDQ $0x40, R9 DECQ AX JNZ mulGFNI_4x10_64_loop VZEROUPPER mulGFNI_4x10_64_end: RET // func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x10(SB), $0-88 // Loading 4 of 40 tables to registers // Destination kept on stack // Full registers estimated 52 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX mulAvxGFNI_4x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y4, (R10)(R9*1) MOVQ 24(R8), R10 VMOVDQU Y5, (R10)(R9*1) MOVQ 48(R8), R10 VMOVDQU Y6, (R10)(R9*1) MOVQ 72(R8), R10 VMOVDQU Y7, (R10)(R9*1) MOVQ 96(R8), R10 VMOVDQU Y8, (R10)(R9*1) MOVQ 120(R8), R10 VMOVDQU Y9, (R10)(R9*1) MOVQ 144(R8), R10 VMOVDQU Y10, (R10)(R9*1) MOVQ 168(R8), R10 VMOVDQU Y11, (R10)(R9*1) MOVQ 192(R8), R10 VMOVDQU Y12, (R10)(R9*1) MOVQ 216(R8), R10 VMOVDQU Y13, (R10)(R9*1) // Prepare for next loop ADDQ $0x20, R9 DECQ AX JNZ mulAvxGFNI_4x10_loop VZEROUPPER mulAvxGFNI_4x10_end: RET // func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 // Loading 20 of 40 tables to registers // Destination kept on stack // Full registers estimated 52 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_4x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX mulGFNI_4x10_64Xor_loop: // Load 10 outputs MOVQ (R8), R10 VMOVDQU64 (R10)(R9*1), Z20 MOVQ 24(R8), R10 VMOVDQU64 (R10)(R9*1), Z21 MOVQ 48(R8), R10 VMOVDQU64 (R10)(R9*1), Z22 MOVQ 72(R8), R10 VMOVDQU64 (R10)(R9*1), Z23 MOVQ 96(R8), R10 VMOVDQU64 (R10)(R9*1), Z24 MOVQ 120(R8), R10 VMOVDQU64 (R10)(R9*1), Z25 MOVQ 144(R8), R10 VMOVDQU64 (R10)(R9*1), Z26 MOVQ 168(R8), R10 VMOVDQU64 (R10)(R9*1), Z27 MOVQ 192(R8), R10 VMOVDQU64 (R10)(R9*1), Z28 MOVQ 216(R8), R10 VMOVDQU64 (R10)(R9*1), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD 
Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R8), R10 VMOVDQU64 Z20, (R10)(R9*1) MOVQ 24(R8), R10 VMOVDQU64 Z21, (R10)(R9*1) MOVQ 48(R8), R10 VMOVDQU64 Z22, (R10)(R9*1) MOVQ 72(R8), R10 VMOVDQU64 Z23, (R10)(R9*1) MOVQ 96(R8), R10 VMOVDQU64 Z24, (R10)(R9*1) MOVQ 120(R8), R10 VMOVDQU64 Z25, (R10)(R9*1) MOVQ 144(R8), R10 VMOVDQU64 Z26, (R10)(R9*1) MOVQ 168(R8), R10 VMOVDQU64 Z27, (R10)(R9*1) MOVQ 192(R8), R10 VMOVDQU64 Z28, (R10)(R9*1) MOVQ 216(R8), R10 VMOVDQU64 Z29, (R10)(R9*1) // Prepare for next loop ADDQ $0x40, R9 DECQ AX JNZ mulGFNI_4x10_64Xor_loop VZEROUPPER mulGFNI_4x10_64Xor_end: RET // func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 // Loading 4 of 40 tables to registers // Destination kept on stack // Full registers estimated 52 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_4x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX mulAvxGFNI_4x10Xor_loop: // Load 10 outputs MOVQ (R8), R10 
VMOVDQU (R10)(R9*1), Y4 MOVQ 24(R8), R10 VMOVDQU (R10)(R9*1), Y5 MOVQ 48(R8), R10 VMOVDQU (R10)(R9*1), Y6 MOVQ 72(R8), R10 VMOVDQU (R10)(R9*1), Y7 MOVQ 96(R8), R10 VMOVDQU (R10)(R9*1), Y8 MOVQ 120(R8), R10 VMOVDQU (R10)(R9*1), Y9 MOVQ 144(R8), R10 VMOVDQU (R10)(R9*1), Y10 MOVQ 168(R8), R10 VMOVDQU (R10)(R9*1), Y11 MOVQ 192(R8), R10 VMOVDQU (R10)(R9*1), Y12 MOVQ 216(R8), R10 VMOVDQU (R10)(R9*1), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y4, (R10)(R9*1) MOVQ 24(R8), R10 VMOVDQU Y5, (R10)(R9*1) MOVQ 48(R8), R10 VMOVDQU Y6, (R10)(R9*1) MOVQ 72(R8), R10 VMOVDQU Y7, (R10)(R9*1) MOVQ 96(R8), R10 VMOVDQU Y8, (R10)(R9*1) MOVQ 120(R8), R10 VMOVDQU Y9, (R10)(R9*1) MOVQ 144(R8), R10 VMOVDQU Y10, (R10)(R9*1) MOVQ 168(R8), R10 VMOVDQU Y11, (R10)(R9*1) MOVQ 192(R8), R10 VMOVDQU Y12, (R10)(R9*1) MOVQ 216(R8), R10 VMOVDQU Y13, (R10)(R9*1) // Prepare for next loop ADDQ $0x20, R9 DECQ AX JNZ mulAvxGFNI_4x10Xor_loop VZEROUPPER mulAvxGFNI_4x10Xor_end: RET // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 95 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX MOVQ $0x0000000f, R10 MOVQ R10, X10 VPBROADCASTB X10, Y10 mulAvxTwo_4x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R8), R10 VMOVDQU (R10)(R9*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R8), R10 VMOVDQU (R10)(R9*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R8), R10 VMOVDQU (R10)(R9*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R8), R10 VMOVDQU (R10)(R9*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R8), R10 VMOVDQU (R10)(R9*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R8), R10 VMOVDQU (R10)(R9*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R8), R10 VMOVDQU (R10)(R9*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R8), R10 VMOVDQU (R10)(R9*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R8), R10 VMOVDQU (R10)(R9*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R8), R10 VMOVDQU (R10)(R9*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( 
$0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y0, (R10)(R9*1) MOVQ 24(R8), R10 VMOVDQU Y1, (R10)(R9*1) MOVQ 48(R8), R10 VMOVDQU Y2, (R10)(R9*1) MOVQ 72(R8), R10 VMOVDQU Y3, (R10)(R9*1) MOVQ 96(R8), R10 VMOVDQU Y4, (R10)(R9*1) MOVQ 120(R8), R10 VMOVDQU Y5, (R10)(R9*1) MOVQ 144(R8), R10 VMOVDQU Y6, (R10)(R9*1) MOVQ 168(R8), R10 VMOVDQU Y7, (R10)(R9*1) MOVQ 192(R8), R10 VMOVDQU Y8, (R10)(R9*1) MOVQ 216(R8), R10 VMOVDQU Y9, (R10)(R9*1) // Prepare for next loop ADDQ $0x20, R9 DECQ AX JNZ mulAvxTwo_4x10Xor_loop VZEROUPPER mulAvxTwo_4x10Xor_end: RET // func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X2 VPBROADCASTB X2, Y2 mulAvxTwo_5x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x1_64_loop 
VZEROUPPER mulAvxTwo_5x1_64_end: RET // func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, CX mulGFNI_5x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z6 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z6, Z5 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z6 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z6 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z6 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (CX), Z6 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 VXORPD Z5, Z6, Z5 // Store 1 outputs VMOVDQU64 Z5, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_5x1_64_loop VZEROUPPER mulGFNI_5x1_64_end: RET // func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, CX mulAvxGFNI_5x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 VXORPD Y5, Y6, Y5 // Store 1 outputs VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x1_loop VZEROUPPER mulAvxGFNI_5x1_end: RET // func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers 
// Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, CX mulGFNI_5x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (R8), Z5 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z6 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z6 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z6 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z6 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 VXORPD Z5, Z6, Z5 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (CX), Z6 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 VXORPD Z5, Z6, Z5 // Store 1 outputs VMOVDQU64 Z5, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_5x1_64Xor_loop VZEROUPPER mulGFNI_5x1_64Xor_end: RET // func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI ADDQ R9, CX mulAvxGFNI_5x1Xor_loop: // Load 1 outputs VMOVDQU (R8), Y5 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 VXORPD Y5, Y6, Y5 // Store 1 outputs VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x1Xor_loop VZEROUPPER mulAvxGFNI_5x1Xor_end: RET // func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ 
mulAvxTwo_5x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X2 VPBROADCASTB X2, Y2 mulAvxTwo_5x1_64Xor_loop: // Load 1 outputs VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x1_64Xor_loop VZEROUPPER mulAvxTwo_5x1_64Xor_end: RET // func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 49 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 ADDQ R11, R9 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X4 VPBROADCASTB X4, Y4 mulAvxTwo_5x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 
VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) ADDQ $0x40, R10 VMOVDQU Y2, (R9) VMOVDQU Y3, 32(R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x2_64_loop VZEROUPPER mulAvxTwo_5x2_64_end: RET // func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 MOVQ in_base+24(FP), CX 
MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 ADDQ R10, R8 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, CX mulGFNI_5x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z12 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z12, Z10 VGF2P8AFFINEQB $0x00, Z1, Z12, Z11 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z12 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z12 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z12 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 VXORPD Z11, Z13, Z11 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (CX), Z12 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 VXORPD Z10, Z13, Z10 VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 VXORPD Z11, Z13, Z11 // Store 2 outputs VMOVDQU64 Z10, (R9) ADDQ $0x40, R9 VMOVDQU64 Z11, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_5x2_64_loop VZEROUPPER mulGFNI_5x2_64_end: RET // func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x2(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 ADDQ R10, R8 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, CX mulAvxGFNI_5x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 VXORPD Y10, Y13, Y10 VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 VXORPD Y11, Y13, Y11 // Store 2 outputs VMOVDQU Y10, (R9) ADDQ $0x20, R9 VMOVDQU Y11, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ 
    VZEROUPPER

mulAvxGFNI_5x2_end:
    RET

// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x2_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 14 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_5x2_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), DX
    MOVQ            24(CX), BX
    MOVQ            48(CX), SI
    MOVQ            72(CX), DI
    MOVQ            96(CX), CX
    MOVQ            out_base+48(FP), R8
    MOVQ            out_base+48(FP), R8
    MOVQ            (R8), R9
    MOVQ            24(R8), R8
    MOVQ            start+72(FP), R10

    // Add start offset to output
    ADDQ R10, R9
    ADDQ R10, R8

    // Add start offset to input
    ADDQ R10, DX
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, CX

mulGFNI_5x2_64Xor_loop:
    // Load 2 outputs
    VMOVDQU64 (R9), Z10
    VMOVDQU64 (R8), Z11

    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU64      (DX), Z12
    ADDQ           $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z12, Z13
    VXORPD         Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z1, Z12, Z13
    VXORPD         Z11, Z13, Z11

    // Load and process 64 bytes from input 1 to 2 outputs
    VMOVDQU64      (BX), Z12
    ADDQ           $0x40, BX
    VGF2P8AFFINEQB $0x00, Z2, Z12, Z13
    VXORPD         Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z3, Z12, Z13
    VXORPD         Z11, Z13, Z11

    // Load and process 64 bytes from input 2 to 2 outputs
    VMOVDQU64      (SI), Z12
    ADDQ           $0x40, SI
    VGF2P8AFFINEQB $0x00, Z4, Z12, Z13
    VXORPD         Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z5, Z12, Z13
    VXORPD         Z11, Z13, Z11

    // Load and process 64 bytes from input 3 to 2 outputs
    VMOVDQU64      (DI), Z12
    ADDQ           $0x40, DI
    VGF2P8AFFINEQB $0x00, Z6, Z12, Z13
    VXORPD         Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z7, Z12, Z13
    VXORPD         Z11, Z13, Z11

    // Load and process 64 bytes from input 4 to 2 outputs
    VMOVDQU64      (CX), Z12
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z8, Z12, Z13
    VXORPD         Z10, Z13, Z10
    VGF2P8AFFINEQB $0x00, Z9, Z12, Z13
    VXORPD         Z11, Z13, Z11

    // Store 2 outputs
    VMOVDQU64 Z10, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z11, (R8)
    ADDQ      $0x40, R8

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_5x2_64Xor_loop
    VZEROUPPER

mulGFNI_5x2_64Xor_end:
    RET

// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 14 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_5x2Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    MOVQ         in_base+24(FP), CX
    MOVQ         (CX), DX
    MOVQ         24(CX), BX
    MOVQ         48(CX), SI
    MOVQ         72(CX), DI
    MOVQ         96(CX), CX
    MOVQ         out_base+48(FP), R8
    MOVQ         out_base+48(FP), R8
    MOVQ         (R8), R9
    MOVQ         24(R8), R8
    MOVQ         start+72(FP), R10

    // Add start offset to output
    ADDQ R10, R9
    ADDQ R10, R8

    // Add start offset to input
    ADDQ R10, DX
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, CX

mulAvxGFNI_5x2Xor_loop:
    // Load 2 outputs
    VMOVDQU (R9), Y10
    VMOVDQU (R8), Y11

    // Load and process 32 bytes from input 0 to 2 outputs
    VMOVDQU        (DX), Y12
    ADDQ           $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y12, Y13
    VXORPD         Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y1, Y12, Y13
    VXORPD         Y11, Y13, Y11
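    // Unlike the plain variant, the Xor form XORs new products into the
    // output vectors loaded above (Y10/Y11), so results accumulate on top
    // of whatever the destination slices already hold.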
    // Load and process 32 bytes from input 1 to 2 outputs
    VMOVDQU        (BX), Y12
    ADDQ           $0x20, BX
    VGF2P8AFFINEQB $0x00, Y2, Y12, Y13
    VXORPD         Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y3, Y12, Y13
    VXORPD         Y11, Y13, Y11

    // Load and process 32 bytes from input 2 to 2 outputs
    VMOVDQU        (SI), Y12
    ADDQ           $0x20, SI
    VGF2P8AFFINEQB $0x00, Y4, Y12, Y13
    VXORPD         Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y12, Y13
    VXORPD         Y11, Y13, Y11

    // Load and process 32 bytes from input 3 to 2 outputs
    VMOVDQU        (DI), Y12
    ADDQ           $0x20, DI
    VGF2P8AFFINEQB $0x00, Y6, Y12, Y13
    VXORPD         Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y12, Y13
    VXORPD         Y11, Y13, Y11

    // Load and process 32 bytes from input 4 to 2 outputs
    VMOVDQU        (CX), Y12
    ADDQ           $0x20, CX
    VGF2P8AFFINEQB $0x00, Y8, Y12, Y13
    VXORPD         Y10, Y13, Y10
    VGF2P8AFFINEQB $0x00, Y9, Y12, Y13
    VXORPD         Y11, Y13, Y11

    // Store 2 outputs
    VMOVDQU Y10, (R9)
    ADDQ    $0x20, R9
    VMOVDQU Y11, (R8)
    ADDQ    $0x20, R8

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_5x2Xor_loop
    VZEROUPPER

mulAvxGFNI_5x2Xor_end:
    RET

// func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 49 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_5x2_64Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), BX
    MOVQ  24(DX), SI
    MOVQ  48(DX), DI
    MOVQ  72(DX), R8
    MOVQ  96(DX), DX
    MOVQ  out_base+48(FP), R9
    MOVQ  out_base+48(FP), R9
    MOVQ  (R9), R10
    MOVQ  24(R9), R9
    MOVQ  start+72(FP), R11

    // Add start offset to output
    ADDQ R11, R10
    ADDQ R11, R9

    // Add start offset to input
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, DX
    MOVQ $0x0000000f, R11
    MOVQ R11, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_5x2_64Xor_loop:
    // Load 2 outputs
    VMOVDQU (R10), Y0
    VMOVDQU 32(R10), Y1
    VMOVDQU (R9), Y2
    VMOVDQU 32(R9), Y3

    // Load and process 64 bytes from input 0 to 2 outputs
    VMOVDQU (BX), Y9
    VMOVDQU 32(BX), Y11
    ADDQ    $0x40, BX
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y11, Y11
    VPAND   Y4, Y10, Y10
    VPAND   Y4, Y12, Y12
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Load and process 64 bytes from input 1 to 2 outputs
    VMOVDQU (SI), Y9
    VMOVDQU 32(SI), Y11
    ADDQ    $0x40, SI
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y11, Y11
    VPAND   Y4, Y10, Y10
    VPAND   Y4, Y12, Y12
    VMOVDQU 128(CX), Y5
    VMOVDQU 160(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU 192(CX), Y5
    VMOVDQU 224(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Load and process 64 bytes from input 2 to 2 outputs
    VMOVDQU (DI), Y9
    VMOVDQU 32(DI), Y11
    ADDQ    $0x40, DI
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y11, Y11
    VPAND   Y4, Y10, Y10
    VPAND   Y4, Y12, Y12
    VMOVDQU 256(CX), Y5
    VMOVDQU 288(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU 320(CX), Y5
    VMOVDQU 352(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Load and process 64 bytes from input 3 to 2 outputs
    VMOVDQU (R8), Y9
    VMOVDQU 32(R8), Y11
    ADDQ    $0x40, R8
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y11, Y11
    VPAND   Y4, Y10, Y10
    VPAND   Y4, Y12, Y12
    VMOVDQU 384(CX), Y5
    VMOVDQU 416(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU 448(CX), Y5
    VMOVDQU 480(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Load and process 64 bytes from input 4 to 2 outputs
    VMOVDQU (DX), Y9
    VMOVDQU 32(DX), Y11
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y9, Y10
    VPSRLQ  $0x04, Y11, Y12
    VPAND   Y4, Y9, Y9
    VPAND   Y4, Y11, Y11
    VPAND   Y4, Y10, Y10
    VPAND   Y4, Y12, Y12
    VMOVDQU 512(CX), Y5
    VMOVDQU 544(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    XOR3WAY( $0x00, Y7, Y8, Y1)
    VMOVDQU 576(CX), Y5
    VMOVDQU 608(CX), Y6
    VPSHUFB Y11, Y5, Y7
    VPSHUFB Y9, Y5, Y5
    VPSHUFB Y12, Y6, Y8
    VPSHUFB Y10, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    XOR3WAY( $0x00, Y7, Y8, Y3)

    // Store 2 outputs
    VMOVDQU Y0, (R10)
    VMOVDQU Y1, 32(R10)
    ADDQ    $0x40, R10
    VMOVDQU Y2, (R9)
    VMOVDQU Y3, 32(R9)
    ADDQ    $0x40, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_5x2_64Xor_loop
    VZEROUPPER

mulAvxTwo_5x2_64Xor_end:
    RET

// func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x3_64(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 70 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_5x3_64_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), BX
    MOVQ  24(DX), SI
    MOVQ  48(DX), DI
    MOVQ  72(DX), R8
    MOVQ  96(DX), DX
    MOVQ  out_base+48(FP), R9
    MOVQ  out_base+48(FP), R9
    MOVQ  (R9), R10
    MOVQ  24(R9), R11
    MOVQ  48(R9), R9
    MOVQ  start+72(FP), R12

    // Add start offset to output
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, R9

    // Add start offset to input
    ADDQ R12, BX
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, R8
    ADDQ R12, DX
    MOVQ $0x0000000f, R12
    MOVQ R12, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_5x3_64_loop:
    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU (BX), Y11
    VMOVDQU 32(BX), Y13
    ADDQ    $0x40, BX
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU (CX), Y7
    VMOVDQU 32(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    VPXOR   Y7, Y8, Y0
    VPXOR   Y9, Y10, Y1
    VMOVDQU 64(CX), Y7
    VMOVDQU 96(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    VPXOR   Y7, Y8, Y2
    VPXOR   Y9, Y10, Y3
    VMOVDQU 128(CX), Y7
    VMOVDQU 160(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    VPXOR   Y7, Y8, Y4
    VPXOR   Y9, Y10, Y5

    // Load and process 64 bytes from input 1 to 3 outputs
    VMOVDQU (SI), Y11
    VMOVDQU 32(SI), Y13
    ADDQ    $0x40, SI
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 192(CX), Y7
    VMOVDQU 224(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 256(CX), Y7
    VMOVDQU 288(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 320(CX), Y7
    VMOVDQU 352(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 2 to 3 outputs
    VMOVDQU (DI), Y11
    VMOVDQU 32(DI), Y13
    ADDQ    $0x40, DI
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 384(CX), Y7
    VMOVDQU 416(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 448(CX), Y7
    VMOVDQU 480(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 512(CX), Y7
    VMOVDQU 544(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 3 to 3 outputs
    VMOVDQU (R8), Y11
    VMOVDQU 32(R8), Y13
    ADDQ    $0x40, R8
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 576(CX), Y7
    VMOVDQU 608(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 640(CX), Y7
    VMOVDQU 672(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 704(CX), Y7
    VMOVDQU 736(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 4 to 3 outputs
    VMOVDQU (DX), Y11
    VMOVDQU 32(DX), Y13
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 768(CX), Y7
    VMOVDQU 800(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 832(CX), Y7
    VMOVDQU 864(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 896(CX), Y7
    VMOVDQU 928(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Store 3 outputs
    VMOVDQU Y0, (R10)
    VMOVDQU Y1, 32(R10)
    ADDQ    $0x40, R10
    VMOVDQU Y2, (R11)
    VMOVDQU Y3, 32(R11)
    ADDQ    $0x40, R11
    VMOVDQU Y4, (R9)
    VMOVDQU Y5, 32(R9)
    ADDQ    $0x40, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_5x3_64_loop
    VZEROUPPER

mulAvxTwo_5x3_64_end:
    RET

// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x3_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 20 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_5x3_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), DX
    MOVQ 24(CX), BX
    MOVQ 48(CX), SI
    MOVQ 72(CX), DI
    MOVQ 96(CX), CX
    MOVQ out_base+48(FP), R8
    MOVQ out_base+48(FP), R8
    MOVQ (R8), R9
    MOVQ 24(R8), R10
    MOVQ 48(R8), R8
    MOVQ start+72(FP), R11

    // Add start offset to output
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, R8

    // Add start offset to input
    ADDQ R11, DX
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, CX

mulGFNI_5x3_64_loop:
    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU64      (DX), Z18
    ADDQ           $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z18, Z15
    VGF2P8AFFINEQB $0x00, Z1, Z18, Z16
    VGF2P8AFFINEQB $0x00, Z2, Z18, Z17

    // Load and process 64 bytes from input 1 to 3 outputs
    VMOVDQU64      (BX), Z18
    ADDQ           $0x40, BX
    VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 2 to 3 outputs
    VMOVDQU64      (SI), Z18
    ADDQ           $0x40, SI
    VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 3 to 3 outputs
    VMOVDQU64      (DI), Z18
    ADDQ           $0x40, DI
    VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 4 to 3 outputs
    VMOVDQU64      (CX), Z18
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Store 3 outputs
    VMOVDQU64 Z15, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z16, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z17, (R8)
    ADDQ      $0x40, R8

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_5x3_64_loop
    VZEROUPPER

mulGFNI_5x3_64_end:
    RET

// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x3(SB), $0-88
    // Loading 11 of 15 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 20 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_5x3_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    VBROADCASTSD 80(CX), Y10
    MOVQ         in_base+24(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         out_base+48(FP), R9
    MOVQ         out_base+48(FP), R9
    MOVQ         (R9), R10
    MOVQ         24(R9), R11
    MOVQ         48(R9), R9
    MOVQ         start+72(FP), R12

    // Add start offset to output
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, R9

    // Add start offset to input
    ADDQ R12, BX
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, R8
    ADDQ R12, DX

mulAvxGFNI_5x3_loop:
    // Load and process 32 bytes from input 0 to 3 outputs
    VMOVDQU        (BX), Y14
    ADDQ           $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

    // Load and process 32 bytes from input 1 to 3 outputs
    VMOVDQU        (SI), Y14
    ADDQ           $0x20, SI
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 3 outputs
    VMOVDQU        (DI), Y14
    ADDQ           $0x20, DI
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD         Y13, Y15, Y13
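    // Only 11 of the 15 tables fit in YMM registers here (see the header
    // comment above); the remaining matrix rows are re-broadcast from
    // memory with VBROADCASTSD inside the loop below.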
    // Load and process 32 bytes from input 3 to 3 outputs
    VMOVDQU        (R8), Y14
    ADDQ           $0x20, R8
    VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 3 outputs
    VMOVDQU        (DX), Y14
    ADDQ           $0x20, DX
    VBROADCASTSD   96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Store 3 outputs
    VMOVDQU Y11, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y12, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y13, (R9)
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_5x3_loop
    VZEROUPPER

mulAvxGFNI_5x3_end:
    RET

// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x3_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 20 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_5x3_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), DX
    MOVQ            24(CX), BX
    MOVQ            48(CX), SI
    MOVQ            72(CX), DI
    MOVQ            96(CX), CX
    MOVQ            out_base+48(FP), R8
    MOVQ            out_base+48(FP), R8
    MOVQ            (R8), R9
    MOVQ            24(R8), R10
    MOVQ            48(R8), R8
    MOVQ            start+72(FP), R11

    // Add start offset to output
    ADDQ R11, R9
    ADDQ R11, R10
    ADDQ R11, R8

    // Add start offset to input
    ADDQ R11, DX
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, CX

mulGFNI_5x3_64Xor_loop:
    // Load 3 outputs
    VMOVDQU64 (R9), Z15
    VMOVDQU64 (R10), Z16
    VMOVDQU64 (R8), Z17

    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU64      (DX), Z18
    ADDQ           $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z1, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z2, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 1 to 3 outputs
    VMOVDQU64      (BX), Z18
    ADDQ           $0x40, BX
    VGF2P8AFFINEQB $0x00, Z3, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z4, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z5, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 2 to 3 outputs
    VMOVDQU64      (SI), Z18
    ADDQ           $0x40, SI
    VGF2P8AFFINEQB $0x00, Z6, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z7, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z8, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 3 to 3 outputs
    VMOVDQU64      (DI), Z18
    ADDQ           $0x40, DI
    VGF2P8AFFINEQB $0x00, Z9, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z10, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z11, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Load and process 64 bytes from input 4 to 3 outputs
    VMOVDQU64      (CX), Z18
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z12, Z18, Z19
    VXORPD         Z15, Z19, Z15
    VGF2P8AFFINEQB $0x00, Z13, Z18, Z19
    VXORPD         Z16, Z19, Z16
    VGF2P8AFFINEQB $0x00, Z14, Z18, Z19
    VXORPD         Z17, Z19, Z17

    // Store 3 outputs
    VMOVDQU64 Z15, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z16, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z17, (R8)
    ADDQ      $0x40, R8

    // Prepare for next loop
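    // AX counts the remaining 64-byte blocks (n was shifted right by 6 in
    // the prologue), so the loop runs once per block.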
    DECQ AX
    JNZ  mulGFNI_5x3_64Xor_loop
    VZEROUPPER

mulGFNI_5x3_64Xor_end:
    RET

// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88
    // Loading 11 of 15 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 20 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_5x3Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    VBROADCASTSD 80(CX), Y10
    MOVQ         in_base+24(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         out_base+48(FP), R9
    MOVQ         out_base+48(FP), R9
    MOVQ         (R9), R10
    MOVQ         24(R9), R11
    MOVQ         48(R9), R9
    MOVQ         start+72(FP), R12

    // Add start offset to output
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, R9

    // Add start offset to input
    ADDQ R12, BX
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, R8
    ADDQ R12, DX

mulAvxGFNI_5x3Xor_loop:
    // Load 3 outputs
    VMOVDQU (R10), Y11
    VMOVDQU (R11), Y12
    VMOVDQU (R9), Y13

    // Load and process 32 bytes from input 0 to 3 outputs
    VMOVDQU        (BX), Y14
    ADDQ           $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 1 to 3 outputs
    VMOVDQU        (SI), Y14
    ADDQ           $0x20, SI
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 3 outputs
    VMOVDQU        (DI), Y14
    ADDQ           $0x20, DI
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 3 outputs
    VMOVDQU        (R8), Y14
    ADDQ           $0x20, R8
    VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 3 outputs
    VMOVDQU        (DX), Y14
    ADDQ           $0x20, DX
    VBROADCASTSD   96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Store 3 outputs
    VMOVDQU Y11, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y12, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y13, (R9)
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_5x3Xor_loop
    VZEROUPPER

mulAvxGFNI_5x3Xor_end:
    RET

// func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 70 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x06, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_5x3_64Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), BX
    MOVQ  24(DX), SI
    MOVQ  48(DX), DI
    MOVQ  72(DX), R8
    MOVQ  96(DX), DX
    MOVQ  out_base+48(FP), R9
    MOVQ  out_base+48(FP), R9
    MOVQ  (R9), R10
    MOVQ  24(R9), R11
    MOVQ  48(R9), R9
    MOVQ  start+72(FP), R12

    // Add start offset to output
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, R9

    // Add start offset to input
    ADDQ R12, BX
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, R8
    ADDQ R12, DX
    MOVQ $0x0000000f, R12
    MOVQ R12, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_5x3_64Xor_loop:
    // Load 3 outputs
    VMOVDQU (R10), Y0
    VMOVDQU 32(R10), Y1
    VMOVDQU (R11), Y2
    VMOVDQU 32(R11), Y3
    VMOVDQU (R9), Y4
    VMOVDQU 32(R9), Y5

    // Load and process 64 bytes from input 0 to 3 outputs
    VMOVDQU (BX), Y11
    VMOVDQU 32(BX), Y13
    ADDQ    $0x40, BX
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU (CX), Y7
    VMOVDQU 32(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 64(CX), Y7
    VMOVDQU 96(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 128(CX), Y7
    VMOVDQU 160(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 1 to 3 outputs
    VMOVDQU (SI), Y11
    VMOVDQU 32(SI), Y13
    ADDQ    $0x40, SI
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 192(CX), Y7
    VMOVDQU 224(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 256(CX), Y7
    VMOVDQU 288(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 320(CX), Y7
    VMOVDQU 352(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 2 to 3 outputs
    VMOVDQU (DI), Y11
    VMOVDQU 32(DI), Y13
    ADDQ    $0x40, DI
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 384(CX), Y7
    VMOVDQU 416(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 448(CX), Y7
    VMOVDQU 480(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 512(CX), Y7
    VMOVDQU 544(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 3 to 3 outputs
    VMOVDQU (R8), Y11
    VMOVDQU 32(R8), Y13
    ADDQ    $0x40, R8
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 576(CX), Y7
    VMOVDQU 608(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 640(CX), Y7
    VMOVDQU 672(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 704(CX), Y7
    VMOVDQU 736(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Load and process 64 bytes from input 4 to 3 outputs
    VMOVDQU (DX), Y11
    VMOVDQU 32(DX), Y13
    ADDQ    $0x40, DX
    VPSRLQ  $0x04, Y11, Y12
    VPSRLQ  $0x04, Y13, Y14
    VPAND   Y6, Y11, Y11
    VPAND   Y6, Y13, Y13
    VPAND   Y6, Y12, Y12
    VPAND   Y6, Y14, Y14
    VMOVDQU 768(CX), Y7
    VMOVDQU 800(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y0)
    XOR3WAY( $0x00, Y9, Y10, Y1)
    VMOVDQU 832(CX), Y7
    VMOVDQU 864(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y2)
    XOR3WAY( $0x00, Y9, Y10, Y3)
    VMOVDQU 896(CX), Y7
    VMOVDQU 928(CX), Y8
    VPSHUFB Y13, Y7, Y9
    VPSHUFB Y11, Y7, Y7
    VPSHUFB Y14, Y8, Y10
    VPSHUFB Y12, Y8, Y8
    XOR3WAY( $0x00, Y7, Y8, Y4)
    XOR3WAY( $0x00, Y9, Y10, Y5)

    // Store 3 outputs
    VMOVDQU Y0, (R10)
    VMOVDQU Y1, 32(R10)
    ADDQ    $0x40, R10
    VMOVDQU Y2, (R11)
    VMOVDQU Y3, 32(R11)
    ADDQ    $0x40, R11
    VMOVDQU Y4, (R9)
    VMOVDQU Y5, 32(R9)
    ADDQ    $0x40, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_5x3_64Xor_loop
    VZEROUPPER

mulAvxTwo_5x3_64Xor_end:
    RET

// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 49 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_5x4_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), BX
    MOVQ  24(DX), SI
    MOVQ  48(DX), DI
    MOVQ  72(DX), R8
    MOVQ  96(DX), DX
    MOVQ  out_base+48(FP), R9
    MOVQ  (R9), R10
    MOVQ  24(R9), R11
    MOVQ  48(R9), R12
    MOVQ  72(R9), R9
    MOVQ  start+72(FP), R13

    // Add start offset to output
    ADDQ R13, R10
    ADDQ R13, R11
    ADDQ R13, R12
    ADDQ R13, R9

    // Add start offset to input
    ADDQ R13, BX
    ADDQ R13, SI
    ADDQ R13, DI
    ADDQ R13, R8
    ADDQ R13, DX
    MOVQ $0x0000000f, R13
    MOVQ R13, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_5x4_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (BX), Y7
    ADDQ    $0x20, BX
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y5, Y6, Y0
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y5, Y6, Y1
    VMOVDQU 128(CX), Y5
    VMOVDQU 160(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y5, Y6, Y2
    VMOVDQU 192(CX), Y5
    VMOVDQU 224(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    VPXOR   Y5, Y6, Y3

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (SI), Y7
    ADDQ    $0x20, SI
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 256(CX), Y5
    VMOVDQU 288(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 320(CX), Y5
    VMOVDQU 352(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 384(CX), Y5
    VMOVDQU 416(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 448(CX), Y5
    VMOVDQU 480(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU (DI), Y7
    ADDQ    $0x20, DI
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 512(CX), Y5
    VMOVDQU 544(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 576(CX), Y5
    VMOVDQU 608(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 640(CX), Y5
    VMOVDQU 672(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 704(CX), Y5
    VMOVDQU 736(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU (R8), Y7
    ADDQ    $0x20, R8
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 768(CX), Y5
    VMOVDQU 800(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 832(CX), Y5
    VMOVDQU 864(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 896(CX), Y5
    VMOVDQU 928(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 960(CX), Y5
    VMOVDQU 992(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU (DX), Y7
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 1024(CX), Y5
    VMOVDQU 1056(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 1088(CX), Y5
    VMOVDQU 1120(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 1152(CX), Y5
    VMOVDQU 1184(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 1216(CX), Y5
    VMOVDQU 1248(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Store 4 outputs
    VMOVDQU Y0, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y1, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y2, (R12)
    ADDQ    $0x20, R12
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_5x4_loop
    VZEROUPPER

mulAvxTwo_5x4_end:
    RET

// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x4_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 26 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_5x4_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), DX
    MOVQ            24(CX), BX
    MOVQ            48(CX), SI
    MOVQ            72(CX), DI
    MOVQ            96(CX), CX
    MOVQ            out_base+48(FP), R8
    MOVQ            out_base+48(FP), R8
    MOVQ            (R8), R9
    MOVQ            24(R8), R10
    MOVQ            48(R8), R11
    MOVQ            72(R8), R8
    MOVQ            start+72(FP), R12

    // Add start offset to output
    ADDQ R12, R9
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, R8

    // Add start offset to input
    ADDQ R12, DX
    ADDQ R12, BX
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, CX

mulGFNI_5x4_64_loop:
    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64      (DX), Z24
    ADDQ           $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z24, Z20
    VGF2P8AFFINEQB $0x00, Z1, Z24, Z21
    VGF2P8AFFINEQB $0x00, Z2, Z24, Z22
    VGF2P8AFFINEQB $0x00, Z3, Z24, Z23

    // Load and process 64 bytes from input 1 to 4 outputs
    VMOVDQU64      (BX), Z24
    ADDQ           $0x40, BX
    VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 2 to 4 outputs
    VMOVDQU64      (SI), Z24
    ADDQ           $0x40, SI
    VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 3 to 4 outputs
    VMOVDQU64      (DI), Z24
    ADDQ           $0x40, DI
    VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 4 to 4 outputs
    VMOVDQU64      (CX), Z24
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Store 4 outputs
    VMOVDQU64 Z20, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z21, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z22, (R11)
    ADDQ      $0x40, R11
    VMOVDQU64 Z23, (R8)
    ADDQ      $0x40, R8

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_5x4_64_loop
    VZEROUPPER

mulGFNI_5x4_64_end:
    RET

// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x4(SB), $0-88
    // Loading 10 of 20 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 26 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_5x4_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    MOVQ         in_base+24(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         out_base+48(FP), R9
    MOVQ         out_base+48(FP), R9
    MOVQ         (R9), R10
    MOVQ         24(R9), R11
    MOVQ         48(R9), R12
    MOVQ         72(R9), R9
    MOVQ         start+72(FP), R13

    // Add start offset to output
    ADDQ R13, R10
    ADDQ R13, R11
    ADDQ R13, R12
    ADDQ R13, R9

    // Add start offset to input
    ADDQ R13, BX
    ADDQ R13, SI
    ADDQ R13, DI
    ADDQ R13, R8
    ADDQ R13, DX

mulAvxGFNI_5x4_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU        (BX), Y14
    ADDQ           $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU        (SI), Y14
    ADDQ           $0x20, SI
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU        (DI), Y14
    ADDQ           $0x20, DI
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU        (R8), Y14
    ADDQ           $0x20, R8
    VBROADCASTSD   96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VBROADCASTSD   104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU        (DX), Y14
    ADDQ           $0x20, DX
    VBROADCASTSD   128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VBROADCASTSD   136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   152(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Store 4 outputs
    VMOVDQU Y10, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y11, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y12, (R12)
    ADDQ    $0x20, R12
    VMOVDQU Y13, (R9)
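    // Advance the last output pointer past the 32 bytes just stored.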
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_5x4_loop
    VZEROUPPER

mulAvxGFNI_5x4_end:
    RET

// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x4_64Xor(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 26 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_5x4_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), DX
    MOVQ            24(CX), BX
    MOVQ            48(CX), SI
    MOVQ            72(CX), DI
    MOVQ            96(CX), CX
    MOVQ            out_base+48(FP), R8
    MOVQ            out_base+48(FP), R8
    MOVQ            (R8), R9
    MOVQ            24(R8), R10
    MOVQ            48(R8), R11
    MOVQ            72(R8), R8
    MOVQ            start+72(FP), R12

    // Add start offset to output
    ADDQ R12, R9
    ADDQ R12, R10
    ADDQ R12, R11
    ADDQ R12, R8

    // Add start offset to input
    ADDQ R12, DX
    ADDQ R12, BX
    ADDQ R12, SI
    ADDQ R12, DI
    ADDQ R12, CX

mulGFNI_5x4_64Xor_loop:
    // Load 4 outputs
    VMOVDQU64 (R9), Z20
    VMOVDQU64 (R10), Z21
    VMOVDQU64 (R11), Z22
    VMOVDQU64 (R8), Z23

    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64      (DX), Z24
    ADDQ           $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 1 to 4 outputs
    VMOVDQU64      (BX), Z24
    ADDQ           $0x40, BX
    VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 2 to 4 outputs
    VMOVDQU64      (SI), Z24
    ADDQ           $0x40, SI
    VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 3 to 4 outputs
    VMOVDQU64      (DI), Z24
    ADDQ           $0x40, DI
    VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Load and process 64 bytes from input 4 to 4 outputs
    VMOVDQU64      (CX), Z24
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
    VXORPD         Z20, Z25, Z20
    VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
    VXORPD         Z21, Z25, Z21
    VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
    VXORPD         Z22, Z25, Z22
    VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
    VXORPD         Z23, Z25, Z23

    // Store 4 outputs
    VMOVDQU64 Z20, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z21, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z22, (R11)
    ADDQ      $0x40, R11
    VMOVDQU64 Z23, (R8)
    ADDQ      $0x40, R8

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_5x4_64Xor_loop
    VZEROUPPER

mulGFNI_5x4_64Xor_end:
    RET

// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT
·mulAvxGFNI_5x4Xor(SB), $0-88
    // Loading 10 of 20 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 26 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_5x4Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    MOVQ         in_base+24(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         out_base+48(FP), R9
    MOVQ         out_base+48(FP), R9
    MOVQ         (R9), R10
    MOVQ         24(R9), R11
    MOVQ         48(R9), R12
    MOVQ         72(R9), R9
    MOVQ         start+72(FP), R13

    // Add start offset to output
    ADDQ R13, R10
    ADDQ R13, R11
    ADDQ R13, R12
    ADDQ R13, R9

    // Add start offset to input
    ADDQ R13, BX
    ADDQ R13, SI
    ADDQ R13, DI
    ADDQ R13, R8
    ADDQ R13, DX

mulAvxGFNI_5x4Xor_loop:
    // Load 4 outputs
    VMOVDQU (R10), Y10
    VMOVDQU (R11), Y11
    VMOVDQU (R12), Y12
    VMOVDQU (R9), Y13

    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU        (BX), Y14
    ADDQ           $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU        (SI), Y14
    ADDQ           $0x20, SI
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU        (DI), Y14
    ADDQ           $0x20, DI
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU        (R8), Y14
    ADDQ           $0x20, R8
    VBROADCASTSD   96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VBROADCASTSD   104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU        (DX), Y14
    ADDQ           $0x20, DX
    VBROADCASTSD   128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VBROADCASTSD   136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   152(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Store 4 outputs
    VMOVDQU Y10, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y11, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y12, (R12)
    ADDQ    $0x20, R12
    VMOVDQU Y13, (R9)
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxGFNI_5x4Xor_loop
    VZEROUPPER

mulAvxGFNI_5x4Xor_end:
    RET

// func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 49 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_5x4Xor_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), BX
    MOVQ  24(DX), SI
    MOVQ  48(DX), DI
    MOVQ  72(DX), R8
    MOVQ  96(DX), DX
    MOVQ  out_base+48(FP), R9
    MOVQ  (R9), R10
    MOVQ  24(R9), R11
    MOVQ  48(R9), R12
    MOVQ  72(R9), R9
    MOVQ  start+72(FP), R13

    // Add start offset to output
    ADDQ R13, R10
    ADDQ R13, R11
    ADDQ R13, R12
    ADDQ R13, R9

    // Add start offset to input
    ADDQ R13, BX
    ADDQ R13, SI
    ADDQ R13, DI
    ADDQ R13, R8
    ADDQ R13, DX
    MOVQ $0x0000000f, R13
    MOVQ R13, X4
    VPBROADCASTB X4, Y4

mulAvxTwo_5x4Xor_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (BX), Y7
    ADDQ    $0x20, BX
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU (R10), Y0
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU (R11), Y1
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU (R12), Y2
    VMOVDQU 128(CX), Y5
    VMOVDQU 160(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU (R9), Y3
    VMOVDQU 192(CX), Y5
    VMOVDQU 224(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (SI), Y7
    ADDQ    $0x20, SI
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 256(CX), Y5
    VMOVDQU 288(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 320(CX), Y5
    VMOVDQU 352(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 384(CX), Y5
    VMOVDQU 416(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 448(CX), Y5
    VMOVDQU 480(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU (DI), Y7
    ADDQ    $0x20, DI
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 512(CX), Y5
    VMOVDQU 544(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 576(CX), Y5
    VMOVDQU 608(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 640(CX), Y5
    VMOVDQU 672(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 704(CX), Y5
    VMOVDQU 736(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU (R8), Y7
    ADDQ    $0x20, R8
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 768(CX), Y5
    VMOVDQU 800(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 832(CX), Y5
    VMOVDQU 864(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 896(CX), Y5
    VMOVDQU 928(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 960(CX), Y5
    VMOVDQU 992(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU (DX), Y7
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y7, Y8
    VPAND   Y4, Y7, Y7
    VPAND   Y4, Y8, Y8
    VMOVDQU 1024(CX), Y5
    VMOVDQU 1056(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 1088(CX), Y5
    VMOVDQU 1120(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 1152(CX), Y5
    VMOVDQU 1184(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 1216(CX), Y5
    VMOVDQU 1248(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Store 4 outputs
    VMOVDQU Y0, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y1, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y2, (R12)
    ADDQ    $0x20, R12
    VMOVDQU Y3, (R9)
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_5x4Xor_loop
    VZEROUPPER

mulAvxTwo_5x4Xor_end:
    RET
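// Naming convention: mul<kernel>_<R>x<C> multiplies R input slices by an
// RxC coefficient matrix into C output slices. A _64 suffix means 64 bytes
// are processed per loop iteration (32 otherwise), and an Xor suffix means
// products are XORed into the existing output instead of overwriting it.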
// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 60 YMM used
    MOVQ  n+80(FP), AX
    MOVQ  matrix_base+0(FP), CX
    SHRQ  $0x05, AX
    TESTQ AX, AX
    JZ    mulAvxTwo_5x5_end
    MOVQ  in_base+24(FP), DX
    MOVQ  (DX), BX
    MOVQ  24(DX), SI
    MOVQ  48(DX), DI
    MOVQ  72(DX), R8
    MOVQ  96(DX), DX
    MOVQ  out_base+48(FP), R9
    MOVQ  (R9), R10
    MOVQ  24(R9), R11
    MOVQ  48(R9), R12
    MOVQ  72(R9), R13
    MOVQ  96(R9), R9
    MOVQ  start+72(FP), R14

    // Add start offset to output
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, R13
    ADDQ R14, R9

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, DX
    MOVQ $0x0000000f, R14
    MOVQ R14, X5
    VPBROADCASTB X5, Y5

mulAvxTwo_5x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y8
    ADDQ    $0x20, BX
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y5, Y8, Y8
    VPAND   Y5, Y9, Y9
    VMOVDQU (CX), Y6
    VMOVDQU 32(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y6, Y7, Y0
    VMOVDQU 64(CX), Y6
    VMOVDQU 96(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y6, Y7, Y1
    VMOVDQU 128(CX), Y6
    VMOVDQU 160(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y6, Y7, Y2
    VMOVDQU 192(CX), Y6
    VMOVDQU 224(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y6, Y7, Y3
    VMOVDQU 256(CX), Y6
    VMOVDQU 288(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR   Y6, Y7, Y4

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (SI), Y8
    ADDQ    $0x20, SI
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y5, Y8, Y8
    VPAND   Y5, Y9, Y9
    VMOVDQU 320(CX), Y6
    VMOVDQU 352(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 384(CX), Y6
    VMOVDQU 416(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 448(CX), Y6
    VMOVDQU 480(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 512(CX), Y6
    VMOVDQU 544(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 576(CX), Y6
    VMOVDQU 608(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 2 to 5 outputs
    VMOVDQU (DI), Y8
    ADDQ    $0x20, DI
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y5, Y8, Y8
    VPAND   Y5, Y9, Y9
    VMOVDQU 640(CX), Y6
    VMOVDQU 672(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 704(CX), Y6
    VMOVDQU 736(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 768(CX), Y6
    VMOVDQU 800(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 832(CX), Y6
    VMOVDQU 864(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 896(CX), Y6
    VMOVDQU 928(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 3 to 5 outputs
    VMOVDQU (R8), Y8
    ADDQ    $0x20, R8
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y5, Y8, Y8
    VPAND   Y5, Y9, Y9
    VMOVDQU 960(CX), Y6
    VMOVDQU 992(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1024(CX), Y6
    VMOVDQU 1056(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1088(CX), Y6
    VMOVDQU 1120(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1152(CX), Y6
    VMOVDQU 1184(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1216(CX), Y6
    VMOVDQU 1248(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 4 to 5 outputs
    VMOVDQU (DX), Y8
    ADDQ    $0x20, DX
    VPSRLQ  $0x04, Y8, Y9
    VPAND   Y5, Y8, Y8
    VPAND   Y5, Y9, Y9
    VMOVDQU 1280(CX), Y6
    VMOVDQU 1312(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1344(CX), Y6
    VMOVDQU 1376(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1408(CX), Y6
    VMOVDQU 1440(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1472(CX), Y6
    VMOVDQU 1504(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1536(CX), Y6
    VMOVDQU 1568(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Store 5 outputs
    VMOVDQU Y0, (R10)
    ADDQ    $0x20, R10
    VMOVDQU Y1, (R11)
    ADDQ    $0x20, R11
    VMOVDQU Y2, (R12)
    ADDQ    $0x20, R12
    VMOVDQU Y3, (R13)
    ADDQ    $0x20, R13
    VMOVDQU Y4, (R9)
    ADDQ    $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ  mulAvxTwo_5x5_loop
    VZEROUPPER

mulAvxTwo_5x5_end:
    RET

// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x5_64(SB), $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 32 YMM used
    MOVQ            n+80(FP), AX
    MOVQ            matrix_base+0(FP), CX
    SHRQ            $0x06, AX
    TESTQ           AX, AX
    JZ              mulGFNI_5x5_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    VBROADCASTF32X2 160(CX), Z20
    VBROADCASTF32X2 168(CX), Z21
    VBROADCASTF32X2 176(CX), Z22
    VBROADCASTF32X2 184(CX), Z23
    VBROADCASTF32X2 192(CX), Z24
    MOVQ            in_base+24(FP), CX
    MOVQ            (CX), DX
    MOVQ            24(CX), BX
    MOVQ            48(CX), SI
    MOVQ            72(CX), DI
    MOVQ            96(CX), CX
    MOVQ            out_base+48(FP), R8
    MOVQ            out_base+48(FP), R8
    MOVQ            (R8), R9
    MOVQ            24(R8), R10
    MOVQ            48(R8), R11
    MOVQ            72(R8), R12
    MOVQ            96(R8), R8
    MOVQ            start+72(FP), R13

    // Add start offset to output
    ADDQ R13, R9
    ADDQ R13, R10
    ADDQ R13, R11
    ADDQ R13, R12
    ADDQ R13, R8

    // Add start offset to input
    ADDQ R13, DX
    ADDQ R13, BX
    ADDQ R13, SI
    ADDQ R13, DI
    ADDQ R13, CX

mulGFNI_5x5_64_loop:
    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64      (DX), Z30
    ADDQ           $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
    VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
    VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
    VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
    VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

    // Load and process 64 bytes from input 1 to 5 outputs
    VMOVDQU64      (BX), Z30
    ADDQ           $0x40, BX
    VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
    VXORPD         Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
    VXORPD         Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
    VXORPD         Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
    VXORPD         Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
    VXORPD         Z29, Z31, Z29

    // Load and process 64 bytes from input 2 to 5 outputs
    VMOVDQU64      (SI), Z30
    ADDQ           $0x40, SI
    VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
    VXORPD         Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
    VXORPD         Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
    VXORPD         Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
    VXORPD         Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
    VXORPD         Z29, Z31, Z29

    // Load and process 64 bytes from input 3 to 5 outputs
    VMOVDQU64      (DI), Z30
    ADDQ           $0x40, DI
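    // Addition in GF(2^8) is XOR, so the VXORPD instructions below
    // accumulate the partial products; they are used purely for their
    // bitwise effect on the vector registers.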
    VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
    VXORPD         Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
    VXORPD         Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
    VXORPD         Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
    VXORPD         Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
    VXORPD         Z29, Z31, Z29

    // Load and process 64 bytes from input 4 to 5 outputs
    VMOVDQU64      (CX), Z30
    ADDQ           $0x40, CX
    VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
    VXORPD         Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
    VXORPD         Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
    VXORPD         Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
    VXORPD         Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
    VXORPD         Z29, Z31, Z29

    // Store 5 outputs
    VMOVDQU64 Z25, (R9)
    ADDQ      $0x40, R9
    VMOVDQU64 Z26, (R10)
    ADDQ      $0x40, R10
    VMOVDQU64 Z27, (R11)
    ADDQ      $0x40, R11
    VMOVDQU64 Z28, (R12)
    ADDQ      $0x40, R12
    VMOVDQU64 Z29, (R8)
    ADDQ      $0x40, R8

    // Prepare for next loop
    DECQ AX
    JNZ  mulGFNI_5x5_64_loop
    VZEROUPPER

mulGFNI_5x5_64_end:
    RET

// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x5(SB), $0-88
    // Loading 9 of 25 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 32 YMM used
    MOVQ         n+80(FP), AX
    MOVQ         matrix_base+0(FP), CX
    SHRQ         $0x05, AX
    TESTQ        AX, AX
    JZ           mulAvxGFNI_5x5_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    MOVQ         in_base+24(FP), DX
    MOVQ         (DX), BX
    MOVQ         24(DX), SI
    MOVQ         48(DX), DI
    MOVQ         72(DX), R8
    MOVQ         96(DX), DX
    MOVQ         out_base+48(FP), R9
    MOVQ         out_base+48(FP), R9
    MOVQ         (R9), R10
    MOVQ         24(R9), R11
    MOVQ         48(R9), R12
    MOVQ         72(R9), R13
    MOVQ         96(R9), R9
    MOVQ         start+72(FP), R14

    // Add start offset to output
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, R13
    ADDQ R14, R9

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, DX

mulAvxGFNI_5x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU        (BX), Y14
    ADDQ           $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU        (SI), Y14
    ADDQ           $0x20, SI
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD         Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   72(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 5 outputs
    VMOVDQU        (DI), Y14
    ADDQ           $0x20, DI
    VBROADCASTSD   80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y9, Y15, Y9
    VBROADCASTSD   88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VBROADCASTSD   96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
    VBROADCASTSD   112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 5 outputs
    VMOVDQU        (R8), Y14
    ADDQ           $0x20, R8
    VBROADCASTSD   120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y9, Y15, Y9
    VBROADCASTSD   128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y10, Y15, Y10
    VBROADCASTSD   136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y11, Y15, Y11
    VBROADCASTSD   144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD         Y12, Y15, Y12
Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x5_loop VZEROUPPER mulAvxGFNI_5x5_end: RET // func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), CX MOVQ out_base+48(FP), R8 MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R8 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R8 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, CX mulGFNI_5x5_64Xor_loop: // Load 5 outputs VMOVDQU64 (R9), Z25 VMOVDQU64 (R10), Z26 VMOVDQU64 (R11), Z27 VMOVDQU64 (R12), Z28 VMOVDQU64 (R8), Z29 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB 
$0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs VMOVDQU64 Z25, (R9) ADDQ $0x40, R9 VMOVDQU64 Z26, (R10) ADDQ $0x40, R10 VMOVDQU64 Z27, (R11) ADDQ $0x40, R11 VMOVDQU64 Z28, (R12) ADDQ $0x40, R12 VMOVDQU64 Z29, (R8) ADDQ $0x40, R8 // Prepare for next loop DECQ AX JNZ mulGFNI_5x5_64Xor_loop VZEROUPPER mulGFNI_5x5_64Xor_end: RET // func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 // Loading 9 of 25 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R9 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R9 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, DX mulAvxGFNI_5x5Xor_loop: // Load 5 outputs VMOVDQU (R10), Y9 VMOVDQU (R11), Y10 VMOVDQU (R12), Y11 VMOVDQU (R13), Y12 VMOVDQU (R9), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, 
Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R10) ADDQ $0x20, R10 VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x5Xor_loop VZEROUPPER mulAvxGFNI_5x5Xor_end: RET // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 60 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_5x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R9 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R9 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X5 VPBROADCASTB X5, Y5 mulAvxTwo_5x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (R10), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R9), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB 
Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x5Xor_loop VZEROUPPER mulAvxTwo_5x5Xor_end: RET // func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 71 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_5x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R9 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R9 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 mulAvxTwo_5x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), 
Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 
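// Classic split-nibble lookup: Y9/Y10 hold each input byte's low and high
// nibble (VPAND with the broadcast 0x0f mask, VPSRLQ by 4), every nibble
// indexes a 32-byte table with VPSHUFB, and the two halves combine by XOR
// (VPXOR initializes the accumulators on input 0; XOR3WAY accumulates on
// the remaining inputs).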
VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R14) ADDQ $0x20, R14 VMOVDQU Y5, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x6_loop VZEROUPPER mulAvxTwo_5x6_end: RET // func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64(SB), $0-88 // Loading 24 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R9 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R9 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, DX mulGFNI_5x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 
to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_5x6_64_loop VZEROUPPER mulGFNI_5x6_64_end: RET // func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x6(SB), $0-88 // Loading 8 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R9 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R9 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, DX mulAvxGFNI_5x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 
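// Per the "Loading 8 of 30 tables" note above, only the first eight matrix
// constants stay resident in Y0-Y7; the rest are re-broadcast from the
// matrix slice with VBROADCASTSD at each use, trading extra memory traffic
// for the 16-entry YMM register file.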
VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x6_loop VZEROUPPER mulAvxGFNI_5x6_end: RET // func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 // Loading 24 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R9 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R9 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, DX mulGFNI_5x6_64Xor_loop: // Load 6 outputs VMOVDQU64 (R10), Z24 VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (R9), Z29 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, 
Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_5x6_64Xor_loop VZEROUPPER mulGFNI_5x6_64Xor_end: RET // func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 // Loading 8 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R9 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R9 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, DX mulAvxGFNI_5x6Xor_loop: // Load 6 outputs VMOVDQU (R10), Y8 VMOVDQU (R11), Y9 VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (R9), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, 
Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R10) ADDQ $0x20, R10 VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x6Xor_loop VZEROUPPER mulAvxGFNI_5x6Xor_end: RET // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 71 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_5x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R9 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R9 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 mulAvxTwo_5x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 
(R10), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R9), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, 
Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R14) ADDQ $0x20, R14 VMOVDQU Y5, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x6Xor_loop VZEROUPPER mulAvxTwo_5x6Xor_end: RET // func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_5x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 mulAvxTwo_5x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 
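// With seven outputs live in Y0-Y6, the nibble mask in Y7 and temporaries
// in Y8-Y11, no YMM registers are left for tables, so every 32-byte table
// pair streams from (CX); the "82 YMM used" figure above presumably counts
// virtual registers before allocation.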
XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R14) ADDQ $0x20, R14 VMOVDQU Y5, (R15) ADDQ $0x20, R15 VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxTwo_5x7_loop VZEROUPPER mulAvxTwo_5x7_end: RET // func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64(SB), $8-88 // Loading 23 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 
88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, DX mulGFNI_5x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ 
AX JNZ mulGFNI_5x7_64_loop VZEROUPPER mulGFNI_5x7_64_end: RET // func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x7(SB), $8-88 // Loading 7 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, DX mulAvxGFNI_5x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_5x7_loop VZEROUPPER mulAvxGFNI_5x7_end: RET // func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 // Loading 23 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, DX mulGFNI_5x7_64Xor_loop: // Load 7 outputs VMOVDQU64 (R10), Z23 VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R9), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // 
Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_5x7_64Xor_loop VZEROUPPER mulGFNI_5x7_64Xor_end: RET // func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 // Loading 7 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, DX mulAvxGFNI_5x7Xor_loop: // Load 7 outputs VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R9), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, 
// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88
	// Loading 7 of 35 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 44 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x7Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX

mulAvxGFNI_5x7Xor_loop:
	// Load 7 outputs
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R9), Y13

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 7 outputs
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_5x7Xor_loop
	VZEROUPPER

mulAvxGFNI_5x7Xor_end:
	RET
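// The mulAvxTwo_* kernels below avoid GFNI: each input byte is split into
// low and high nibbles, each nibble indexes a 16-entry lookup table held in
// the matrix buffer (at CX) via VPSHUFB, and the XOR3WAY macro (defined at
// the top of this file) folds both halves into the accumulator with a
// single VPTERNLOGD on GOAMD64_v4, or two VPXORs otherwise. Illustrative
// scalar equivalent of one byte step (these names are not symbols here):
//
//	out[i] ^= mulLo[in[i]&0x0f] ^ mulHi[in[i]>>4]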
// func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 82 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x7Xor_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX
	MOVQ $0x0000000f, BP
	MOVQ BP, X7
	VPBROADCASTB X7, Y7

mulAvxTwo_5x7Xor_loop:
	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y10
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU (R10), Y0
	VMOVDQU (CX), Y8
	VMOVDQU 32(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y0)
	VMOVDQU (R11), Y1
	VMOVDQU 64(CX), Y8
	VMOVDQU 96(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y1)
	VMOVDQU (R12), Y2
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y2)
	VMOVDQU (R13), Y3
	VMOVDQU 192(CX), Y8
	VMOVDQU 224(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y3)
	VMOVDQU (R14), Y4
	VMOVDQU 256(CX), Y8
	VMOVDQU 288(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y4)
	VMOVDQU (R15), Y5
	VMOVDQU 320(CX), Y8
	VMOVDQU 352(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y5)
	VMOVDQU (R9), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y6)

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y10
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 448(CX), Y8
	VMOVDQU 480(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y0)
	VMOVDQU 512(CX), Y8
	VMOVDQU 544(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y1)
	VMOVDQU 576(CX), Y8
	VMOVDQU 608(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y2)
	VMOVDQU 640(CX), Y8
	VMOVDQU 672(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y3)
	VMOVDQU 704(CX), Y8
	VMOVDQU 736(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y4)
	VMOVDQU 768(CX), Y8
	VMOVDQU 800(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y5)
	VMOVDQU 832(CX), Y8
	VMOVDQU 864(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y6)

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y10
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 896(CX), Y8
	VMOVDQU 928(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y0)
	VMOVDQU 960(CX), Y8
	VMOVDQU 992(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y1)
	VMOVDQU 1024(CX), Y8
	VMOVDQU 1056(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y2)
	VMOVDQU 1088(CX), Y8
	VMOVDQU 1120(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y3)
	VMOVDQU 1152(CX), Y8
	VMOVDQU 1184(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y4)
	VMOVDQU 1216(CX), Y8
	VMOVDQU 1248(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y5)
	VMOVDQU 1280(CX), Y8
	VMOVDQU 1312(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y6)

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y10
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 1344(CX), Y8
	VMOVDQU 1376(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y0)
	VMOVDQU 1408(CX), Y8
	VMOVDQU 1440(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y1)
	VMOVDQU 1472(CX), Y8
	VMOVDQU 1504(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y2)
	VMOVDQU 1536(CX), Y8
	VMOVDQU 1568(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y3)
	VMOVDQU 1600(CX), Y8
	VMOVDQU 1632(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y4)
	VMOVDQU 1664(CX), Y8
	VMOVDQU 1696(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y5)
	VMOVDQU 1728(CX), Y8
	VMOVDQU 1760(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y6)

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y10
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 1792(CX), Y8
	VMOVDQU 1824(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y0)
	VMOVDQU 1856(CX), Y8
	VMOVDQU 1888(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y1)
	VMOVDQU 1920(CX), Y8
	VMOVDQU 1952(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y2)
	VMOVDQU 1984(CX), Y8
	VMOVDQU 2016(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y3)
	VMOVDQU 2048(CX), Y8
	VMOVDQU 2080(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y4)
	VMOVDQU 2112(CX), Y8
	VMOVDQU 2144(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y5)
	VMOVDQU 2176(CX), Y8
	VMOVDQU 2208(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	XOR3WAY( $0x00, Y8, Y9, Y6)

	// Store 7 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y2, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y3, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y4, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y5, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x7Xor_loop
	VZEROUPPER

mulAvxTwo_5x7Xor_end:
	RET
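// mulAvxTwo_5x8 below is the non-Xor form: the products of input 0
// initialize the eight accumulators directly with VPXOR, and only the
// remaining inputs accumulate through XOR3WAY. The loop counter lives in BP
// (reloaded from n+80(FP)) because AX is needed as the fifth input pointer.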
// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 93 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x8_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX
	MOVQ $0x0000000f, BP
	MOVQ BP, X8
	VPBROADCASTB X8, Y8
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxTwo_5x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU (CX), Y9
	VMOVDQU 32(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y0
	VMOVDQU 64(CX), Y9
	VMOVDQU 96(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y1
	VMOVDQU 128(CX), Y9
	VMOVDQU 160(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y2
	VMOVDQU 192(CX), Y9
	VMOVDQU 224(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y3
	VMOVDQU 256(CX), Y9
	VMOVDQU 288(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y4
	VMOVDQU 320(CX), Y9
	VMOVDQU 352(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y5
	VMOVDQU 384(CX), Y9
	VMOVDQU 416(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y6
	VMOVDQU 448(CX), Y9
	VMOVDQU 480(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y7

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 512(CX), Y9
	VMOVDQU 544(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 576(CX), Y9
	VMOVDQU 608(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 640(CX), Y9
	VMOVDQU 672(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 704(CX), Y9
	VMOVDQU 736(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 768(CX), Y9
	VMOVDQU 800(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 832(CX), Y9
	VMOVDQU 864(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 896(CX), Y9
	VMOVDQU 928(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 960(CX), Y9
	VMOVDQU 992(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 1024(CX), Y9
	VMOVDQU 1056(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 1088(CX), Y9
	VMOVDQU 1120(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1152(CX), Y9
	VMOVDQU 1184(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 1216(CX), Y9
	VMOVDQU 1248(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1280(CX), Y9
	VMOVDQU 1312(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 1344(CX), Y9
	VMOVDQU 1376(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 1408(CX), Y9
	VMOVDQU 1440(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 1472(CX), Y9
	VMOVDQU 1504(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 1536(CX), Y9
	VMOVDQU 1568(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 1600(CX), Y9
	VMOVDQU 1632(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1664(CX), Y9
	VMOVDQU 1696(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 1728(CX), Y9
	VMOVDQU 1760(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1792(CX), Y9
	VMOVDQU 1824(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 1856(CX), Y9
	VMOVDQU 1888(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 1920(CX), Y9
	VMOVDQU 1952(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 1984(CX), Y9
	VMOVDQU 2016(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y11
	ADDQ $0x20, AX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 2048(CX), Y9
	VMOVDQU 2080(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 2112(CX), Y9
	VMOVDQU 2144(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 2176(CX), Y9
	VMOVDQU 2208(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 2240(CX), Y9
	VMOVDQU 2272(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 2304(CX), Y9
	VMOVDQU 2336(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 2368(CX), Y9
	VMOVDQU 2400(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 2432(CX), Y9
	VMOVDQU 2464(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 2496(CX), Y9
	VMOVDQU 2528(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Store 8 outputs
	VMOVDQU Y0, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y1, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y2, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y3, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y4, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y5, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y6, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxTwo_5x8_loop
	VZEROUPPER

mulAvxTwo_5x8_end:
	RET
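// mulGFNI_5x8_64 below processes 64 bytes per iteration in ZMM registers.
// 22 of the 40 affine matrices stay resident in Z0-Z21; the rest are
// applied straight from memory through VGF2P8AFFINEQB.BCST, the embedded
// 64-bit broadcast form, so no extra register is consumed per table.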
// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x8_64(SB), $8-88
	// Loading 22 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x8_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_5x8_64_loop:
	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_5x8_64_loop
	VZEROUPPER

mulGFNI_5x8_64_end:
	RET
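// mulAvxGFNI_5x8 below is the 32-byte YMM variant. Input 0 writes its
// products directly into the accumulators Y6-Y13 (no XOR is needed on the
// first input); only six matrices stay resident, the rest are broadcast on
// the fly.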
// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x8(SB), $8-88
	// Loading 6 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x8_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_5x8_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y11
	VBROADCASTSD 48(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 56(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_5x8_loop
	VZEROUPPER

mulAvxGFNI_5x8_end:
	RET
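// mulGFNI_5x8_64Xor below is the accumulating version of mulGFNI_5x8_64:
// identical table split (22 resident, rest via .BCST), plus an initial load
// of all eight destination vectors at the top of each iteration.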
// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x8_64Xor(SB), $8-88
	// Loading 22 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x8_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_5x8_64Xor_loop:
	// Load 8 outputs
	VMOVDQU64 (R9), Z22
	VMOVDQU64 (R10), Z23
	VMOVDQU64 (R11), Z24
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R8), Z29

	// Load and process 64 bytes from input 0 to 8 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 8 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 8 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 8 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 8 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 8 outputs
	VMOVDQU64 Z22, (R9)
	ADDQ $0x40, R9
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10
	VMOVDQU64 Z24, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R8)
	ADDQ $0x40, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_5x8_64Xor_loop
	VZEROUPPER

mulGFNI_5x8_64Xor_end:
	RET
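// mulAvxGFNI_5x8Xor below is the accumulating 32-byte variant with six
// resident tables (Y0-Y5).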
// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88
	// Loading 6 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 50 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x8Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_5x8Xor_loop:
	// Load 8 outputs
	VMOVDQU (R9), Y6
	VMOVDQU (R10), Y7
	VMOVDQU (R11), Y8
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R8), Y13

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y6, Y15, Y6
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y7, Y15, Y7
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 48(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 56(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y14
	ADDQ $0x20, AX
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 8 outputs
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y8, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y9, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y11, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y12, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y13, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxGFNI_5x8Xor_loop
	VZEROUPPER

mulAvxGFNI_5x8Xor_end:
	RET
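// mulAvxTwo_5x8Xor below is the PSHUFB-based accumulating kernel: the first
// input block interleaves the output loads (VMOVDQU (R9), Y0 and so on)
// with its table lookups, so no separate "Load 8 outputs" section is
// needed.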
// func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 93 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x8Xor_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX
	MOVQ $0x0000000f, BP
	MOVQ BP, X8
	VPBROADCASTB X8, Y8
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxTwo_5x8Xor_loop:
	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU (R9), Y0
	VMOVDQU (CX), Y9
	VMOVDQU 32(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU (R10), Y1
	VMOVDQU 64(CX), Y9
	VMOVDQU 96(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU (R11), Y2
	VMOVDQU 128(CX), Y9
	VMOVDQU 160(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU (R12), Y3
	VMOVDQU 192(CX), Y9
	VMOVDQU 224(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU (R13), Y4
	VMOVDQU 256(CX), Y9
	VMOVDQU 288(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU (R14), Y5
	VMOVDQU 320(CX), Y9
	VMOVDQU 352(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU (R15), Y6
	VMOVDQU 384(CX), Y9
	VMOVDQU 416(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU (R8), Y7
	VMOVDQU 448(CX), Y9
	VMOVDQU 480(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 512(CX), Y9
	VMOVDQU 544(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 576(CX), Y9
	VMOVDQU 608(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 640(CX), Y9
	VMOVDQU 672(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 704(CX), Y9
	VMOVDQU 736(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 768(CX), Y9
	VMOVDQU 800(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 832(CX), Y9
	VMOVDQU 864(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 896(CX), Y9
	VMOVDQU 928(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 960(CX), Y9
	VMOVDQU 992(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 1024(CX), Y9
	VMOVDQU 1056(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 1088(CX), Y9
	VMOVDQU 1120(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1152(CX), Y9
	VMOVDQU 1184(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 1216(CX), Y9
	VMOVDQU 1248(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1280(CX), Y9
	VMOVDQU 1312(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 1344(CX), Y9
	VMOVDQU 1376(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 1408(CX), Y9
	VMOVDQU 1440(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 1472(CX), Y9
	VMOVDQU 1504(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 1536(CX), Y9
	VMOVDQU 1568(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 1600(CX), Y9
	VMOVDQU 1632(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1664(CX), Y9
	VMOVDQU 1696(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 1728(CX), Y9
	VMOVDQU 1760(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1792(CX), Y9
	VMOVDQU 1824(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 1856(CX), Y9
	VMOVDQU 1888(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 1920(CX), Y9
	VMOVDQU 1952(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 1984(CX), Y9
	VMOVDQU 2016(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y11
	ADDQ $0x20, AX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 2048(CX), Y9
	VMOVDQU 2080(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y0)
	VMOVDQU 2112(CX), Y9
	VMOVDQU 2144(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 2176(CX), Y9
	VMOVDQU 2208(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VMOVDQU 2240(CX), Y9
	VMOVDQU 2272(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 2304(CX), Y9
	VMOVDQU 2336(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VMOVDQU 2368(CX), Y9
	VMOVDQU 2400(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)
	VMOVDQU 2432(CX), Y9
	VMOVDQU 2464(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VMOVDQU 2496(CX), Y9
	VMOVDQU 2528(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)

	// Store 8 outputs
	VMOVDQU Y0, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y1, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y2, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y3, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y4, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y5, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y6, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxTwo_5x8Xor_loop
	VZEROUPPER

mulAvxTwo_5x8Xor_end:
	RET
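// mulAvxTwo_5x9 and the remaining 9-output kernels keep the destination on
// the stack: there are not enough general-purpose registers to pin nine
// output pointers, so out_base stays in R9 and each slice pointer is
// re-read at store time, indexed as (R11)(R10*1) with the shared offset R10
// advanced by 0x20 per iteration.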
// func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept on stack
	// Full registers estimated 104 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x9_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX
	MOVQ $0x0000000f, R11
	MOVQ R11, X9
	VPBROADCASTB X9, Y9

mulAvxTwo_5x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU (CX), Y10
	VMOVDQU 32(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y0
	VMOVDQU 64(CX), Y10
	VMOVDQU 96(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y1
	VMOVDQU 128(CX), Y10
	VMOVDQU 160(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y2
	VMOVDQU 192(CX), Y10
	VMOVDQU 224(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y3
	VMOVDQU 256(CX), Y10
	VMOVDQU 288(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y4
	VMOVDQU 320(CX), Y10
	VMOVDQU 352(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y5
	VMOVDQU 384(CX), Y10
	VMOVDQU 416(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y6
	VMOVDQU 448(CX), Y10
	VMOVDQU 480(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y7
	VMOVDQU 512(CX), Y10
	VMOVDQU 544(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y8

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y12
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 576(CX), Y10
	VMOVDQU 608(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU 640(CX), Y10
	VMOVDQU 672(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU 704(CX), Y10
	VMOVDQU 736(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU 768(CX), Y10
	VMOVDQU 800(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU 832(CX), Y10
	VMOVDQU 864(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU 896(CX), Y10
	VMOVDQU 928(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y5)
	VMOVDQU 960(CX), Y10
	VMOVDQU 992(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y6)
	VMOVDQU 1024(CX), Y10
	VMOVDQU 1056(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y7)
	VMOVDQU 1088(CX), Y10
	VMOVDQU 1120(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y8)

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y12
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 1152(CX), Y10
	VMOVDQU 1184(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU 1216(CX), Y10
	VMOVDQU 1248(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU 1280(CX), Y10
	VMOVDQU 1312(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU 1344(CX), Y10
	VMOVDQU 1376(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU 1408(CX), Y10
	VMOVDQU 1440(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU 1472(CX), Y10
	VMOVDQU 1504(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y5)
	VMOVDQU 1536(CX), Y10
	VMOVDQU 1568(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y6)
	VMOVDQU 1600(CX), Y10
	VMOVDQU 1632(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y7)
	VMOVDQU 1664(CX), Y10
	VMOVDQU 1696(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y8)

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y12
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 1728(CX), Y10
	VMOVDQU 1760(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU 1792(CX), Y10
	VMOVDQU 1824(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU 1856(CX), Y10
	VMOVDQU 1888(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU 1920(CX), Y10
	VMOVDQU 1952(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU 1984(CX), Y10
	VMOVDQU 2016(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU 2048(CX), Y10
	VMOVDQU 2080(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y5)
	VMOVDQU 2112(CX), Y10
	VMOVDQU 2144(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y6)
	VMOVDQU 2176(CX), Y10
	VMOVDQU 2208(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y7)
	VMOVDQU 2240(CX), Y10
	VMOVDQU 2272(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y8)

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 2304(CX), Y10
	VMOVDQU 2336(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y0)
	VMOVDQU 2368(CX), Y10
	VMOVDQU 2400(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y1)
	VMOVDQU 2432(CX), Y10
	VMOVDQU 2464(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y2)
	VMOVDQU 2496(CX), Y10
	VMOVDQU 2528(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y3)
	VMOVDQU 2560(CX), Y10
	VMOVDQU 2592(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y4)
	VMOVDQU 2624(CX), Y10
	VMOVDQU 2656(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y5)
	VMOVDQU 2688(CX), Y10
	VMOVDQU 2720(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y6)
	VMOVDQU 2752(CX), Y10
	VMOVDQU 2784(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y7)
	VMOVDQU 2816(CX), Y10
	VMOVDQU 2848(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	XOR3WAY( $0x00, Y10, Y11, Y8)

	// Store 9 outputs
	MOVQ (R9), R11
	VMOVDQU Y0, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU Y1, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU Y2, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU Y3, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU Y4, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU Y5, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU Y6, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU Y7, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU Y8, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x20, R10
	DECQ AX
	JNZ mulAvxTwo_5x9_loop
	VZEROUPPER

mulAvxTwo_5x9_end:
	RET
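// mulGFNI_5x9_64 below is the ZMM 9-output kernel: 21 of the 45 affine
// matrices stay resident in Z0-Z20; stores go through the same pointer-
// reload scheme, with the shared offset R10 advanced by 0x40 per iteration.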
// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x9_64(SB), $0-88
	// Loading 21 of 45 tables to registers
	// Destination kept on stack
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x9_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulGFNI_5x9_64_loop:
	// Load and process 64 bytes from input 0 to 9 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z29

	// Load and process 64 bytes from input 1 to 9 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 9 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 9 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 9 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 9 outputs
	MOVQ (R9), R11
	VMOVDQU64 Z21, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU64 Z22, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU64 Z23, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU64 Z24, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU64 Z25, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU64 Z26, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU64 Z27, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU64 Z28, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU64 Z29, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x40, R10
	DECQ AX
	JNZ mulGFNI_5x9_64_loop
	VZEROUPPER

mulGFNI_5x9_64_end:
	RET
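// mulAvxGFNI_5x9 below is the 32-byte YMM 9-output kernel; only five
// matrices stay resident (Y0-Y4), the rest are broadcast per use.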
// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x9(SB), $0-88
	// Loading 5 of 45 tables to registers
	// Destination kept on stack
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x9_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulAvxGFNI_5x9_loop:
	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y7
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y9
	VBROADCASTSD 40(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 48(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 56(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 64(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 9 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 9 outputs
	MOVQ (R9), R11
	VMOVDQU Y5, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU Y6, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU Y7, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU Y8, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU Y9, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU Y10, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU Y11, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU Y12, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU Y13, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x20, R10
	DECQ AX
	JNZ mulAvxGFNI_5x9_loop
	VZEROUPPER

mulAvxGFNI_5x9_end:
	RET
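// mulGFNI_5x9_64Xor below is the accumulating form: each iteration first
// walks the output pointer table to load all nine destination vectors
// before accumulating the products into them.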
offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX mulGFNI_5x9_64Xor_loop: // Load 9 outputs MOVQ (R9), R11 VMOVDQU64 (R11)(R10*1), Z21 MOVQ 24(R9), R11 VMOVDQU64 (R11)(R10*1), Z22 MOVQ 48(R9), R11 VMOVDQU64 (R11)(R10*1), Z23 MOVQ 72(R9), R11 VMOVDQU64 (R11)(R10*1), Z24 MOVQ 96(R9), R11 VMOVDQU64 (R11)(R10*1), Z25 MOVQ 120(R9), R11 VMOVDQU64 (R11)(R10*1), Z26 MOVQ 144(R9), R11 VMOVDQU64 (R11)(R10*1), Z27 MOVQ 168(R9), R11 VMOVDQU64 (R11)(R10*1), Z28 MOVQ 192(R9), R11 VMOVDQU64 (R11)(R10*1), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 
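	// Note: only 21 of the 45 coefficient matrices fit in Z0-Z20 for this
	// kernel, so from input 2 onwards the VGF2P8AFFINEQB.BCST form is used,
	// folding an 8-byte broadcast of each remaining matrix from the matrix
	// slice directly into the instruction instead of using a table register.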
VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R9), R11 VMOVDQU64 Z21, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU64 Z22, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU64 Z23, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU64 Z24, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU64 Z25, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU64 Z26, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU64 Z27, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU64 Z28, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU64 Z29, (R11)(R10*1) // Prepare for next loop ADDQ $0x40, R10 DECQ AX JNZ mulGFNI_5x9_64Xor_loop VZEROUPPER mulGFNI_5x9_64Xor_end: RET // func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 // Loading 5 of 45 tables to registers // Destination kept on stack // Full registers estimated 56 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX mulAvxGFNI_5x9Xor_loop: // Load 9 outputs MOVQ (R9), R11 VMOVDQU (R11)(R10*1), Y5 MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y6 MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y7 MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y8 MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y9 MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y10 MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y11 MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y12 MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y5, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU Y6, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU Y7, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU Y8, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU Y9, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU Y10, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU Y11, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU Y12, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU Y13, (R11)(R10*1) // Prepare for next loop ADDQ $0x20, R10 DECQ AX JNZ mulAvxGFNI_5x9Xor_loop VZEROUPPER mulAvxGFNI_5x9Xor_end: RET // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 104 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_5x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX MOVQ $0x0000000f, R11 MOVQ R11, X9 VPBROADCASTB X9, Y9 mulAvxTwo_5x9Xor_loop: // Load 
and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R9), R11 VMOVDQU (R11)(R10*1), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, 
Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU Y1, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU Y2, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU Y3, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU Y4, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU Y5, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU Y6, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU Y7, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU Y8, (R11)(R10*1) // Prepare for next loop ADDQ $0x20, R10 DECQ AX JNZ mulAvxTwo_5x9Xor_loop VZEROUPPER mulAvxTwo_5x9Xor_end: RET // func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 // Loading no tables to registers 
	// Destination kept on stack
	// Full registers estimated 115 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x10_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX
	MOVQ $0x0000000f, R11
	MOVQ R11, X10
	VPBROADCASTB X10, Y10

mulAvxTwo_5x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y13
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU (CX), Y11
	VMOVDQU 32(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y0
	VMOVDQU 64(CX), Y11
	VMOVDQU 96(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y1
	VMOVDQU 128(CX), Y11
	VMOVDQU 160(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y2
	VMOVDQU 192(CX), Y11
	VMOVDQU 224(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y3
	VMOVDQU 256(CX), Y11
	VMOVDQU 288(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y4
	VMOVDQU 320(CX), Y11
	VMOVDQU 352(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y5
	VMOVDQU 384(CX), Y11
	VMOVDQU 416(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y6
	VMOVDQU 448(CX), Y11
	VMOVDQU 480(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y7
	VMOVDQU 512(CX), Y11
	VMOVDQU 544(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y8
	VMOVDQU 576(CX), Y11
	VMOVDQU 608(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y9

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y13
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU 640(CX), Y11
	VMOVDQU 672(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y0)
	VMOVDQU 704(CX), Y11
	VMOVDQU 736(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y1)
	VMOVDQU 768(CX), Y11
	VMOVDQU 800(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y2)
	VMOVDQU 832(CX), Y11
	VMOVDQU 864(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y3)
	VMOVDQU 896(CX), Y11
	VMOVDQU 928(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y4)
	VMOVDQU 960(CX), Y11
	VMOVDQU 992(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y5)
	VMOVDQU 1024(CX), Y11
	VMOVDQU 1056(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y6)
	VMOVDQU 1088(CX), Y11
	VMOVDQU 1120(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y7)
	VMOVDQU 1152(CX), Y11
	VMOVDQU 1184(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y8)
	VMOVDQU 1216(CX), Y11
	VMOVDQU 1248(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y9)

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y13
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU 1280(CX), Y11
	VMOVDQU 1312(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y0)
	VMOVDQU 1344(CX), Y11
	VMOVDQU 1376(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y1)
	VMOVDQU 1408(CX), Y11
	VMOVDQU 1440(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	XOR3WAY( $0x00, Y11, Y12, Y2)
	VMOVDQU 1472(CX), Y11
	VMOVDQU 1504(CX), Y12
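	// Note: each block above follows the split-table pattern: the low and
	// high nibbles of every input byte (isolated with VPAND/VPSRLQ against
	// the 0x0f mask broadcast in Y10) index a pair of 32-byte VPSHUFB
	// lookup tables of partial products, and the two halves are combined
	// with XOR. A scalar sketch of the same idea (illustrative only; the
	// helper name is not from this package):
	//
	//	func mulSplitTable(lo, hi *[16]byte, in, out []byte) {
	//		for i, b := range in {
	//			out[i] = lo[b&0x0f] ^ hi[b>>4]
	//		}
	//	}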
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU Y1, (R11)(R10*1) MOVQ 
48(R9), R11 VMOVDQU Y2, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU Y3, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU Y4, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU Y5, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU Y6, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU Y7, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU Y8, (R11)(R10*1) MOVQ 216(R9), R11 VMOVDQU Y9, (R11)(R10*1) // Prepare for next loop ADDQ $0x20, R10 DECQ AX JNZ mulAvxTwo_5x10_loop VZEROUPPER mulAvxTwo_5x10_end: RET // func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64(SB), $0-88 // Loading 20 of 50 tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_5x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX mulGFNI_5x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 
10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R9), R11 VMOVDQU64 Z20, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU64 Z21, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU64 Z22, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU64 Z23, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU64 Z24, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU64 Z25, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU64 Z26, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU64 Z27, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU64 Z28, (R11)(R10*1) MOVQ 216(R9), R11 VMOVDQU64 Z29, (R11)(R10*1) // Prepare for next loop ADDQ $0x40, R10 DECQ AX JNZ mulGFNI_5x10_64_loop VZEROUPPER mulGFNI_5x10_64_end: RET // func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_5x10(SB), $0-88 // Loading 4 of 50 tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_5x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX mulAvxGFNI_5x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 
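	// Note: with only 4 of the 50 coefficient matrices held in Y0-Y3, Y15
	// serves as the scratch register for every remaining coefficient:
	// broadcast the 8x8 matrix, transform the freshly loaded input in Y14,
	// then fold the product into the output accumulator with VXORPD.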
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 10 outputs
	MOVQ (R9), R11
	VMOVDQU Y4, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU Y5, (R11)(R10*1)
	MOVQ 48(R9), R11
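	// Note: "Destination kept on stack" means the ten outputs never get
	// dedicated pointer registers; every store re-reads the data pointer of
	// output j from its slice header at 24*j(R9) and indexes it with the
	// running byte offset in R10.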
	VMOVDQU Y6, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU Y7, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU Y8, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU Y9, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU Y10, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU Y11, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU Y12, (R11)(R10*1)
	MOVQ 216(R9), R11
	VMOVDQU Y13, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x20, R10
	DECQ AX
	JNZ mulAvxGFNI_5x10_loop
	VZEROUPPER

mulAvxGFNI_5x10_end:
	RET

// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_5x10_64Xor(SB), $0-88
	// Loading 20 of 50 tables to registers
	// Destination kept on stack
	// Full registers estimated 62 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_5x10_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulGFNI_5x10_64Xor_loop:
	// Load 10 outputs
	MOVQ (R9), R11
	VMOVDQU64 (R11)(R10*1), Z20
	MOVQ 24(R9), R11
	VMOVDQU64 (R11)(R10*1), Z21
	MOVQ 48(R9), R11
	VMOVDQU64 (R11)(R10*1), Z22
	MOVQ 72(R9), R11
	VMOVDQU64 (R11)(R10*1), Z23
	MOVQ 96(R9), R11
	VMOVDQU64 (R11)(R10*1), Z24
	MOVQ 120(R9), R11
	VMOVDQU64 (R11)(R10*1), Z25
	MOVQ 144(R9), R11
	VMOVDQU64 (R11)(R10*1), Z26
	MOVQ 168(R9), R11
	VMOVDQU64 (R11)(R10*1), Z27
	MOVQ 192(R9), R11
	VMOVDQU64 (R11)(R10*1), Z28
	MOVQ 216(R9), R11
	VMOVDQU64 (R11)(R10*1), Z29

	// Load and process 64 bytes from input 0 to 10 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 10 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 10 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
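	// Note: the five input shards are walked through BX, SI, DI, R8 and DX;
	// each block of this loop consumes 64 bytes from one shard and folds it
	// into all ten output accumulators loaded at the top of the loop, which
	// is what makes this the accumulating (Xor) variant.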
	VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 10 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R9), R11
	VMOVDQU64 Z20, (R11)(R10*1)
	MOVQ 24(R9), R11
	VMOVDQU64 Z21, (R11)(R10*1)
	MOVQ 48(R9), R11
	VMOVDQU64 Z22, (R11)(R10*1)
	MOVQ 72(R9), R11
	VMOVDQU64 Z23, (R11)(R10*1)
	MOVQ 96(R9), R11
	VMOVDQU64 Z24, (R11)(R10*1)
	MOVQ 120(R9), R11
	VMOVDQU64 Z25, (R11)(R10*1)
	MOVQ 144(R9), R11
	VMOVDQU64 Z26, (R11)(R10*1)
	MOVQ 168(R9), R11
	VMOVDQU64 Z27, (R11)(R10*1)
	MOVQ 192(R9), R11
	VMOVDQU64 Z28, (R11)(R10*1)
	MOVQ 216(R9), R11
	VMOVDQU64 Z29, (R11)(R10*1)

	// Prepare for next loop
	ADDQ $0x40, R10
	DECQ AX
	JNZ mulGFNI_5x10_64Xor_loop
	VZEROUPPER

mulGFNI_5x10_64Xor_end:
	RET

// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88
	// Loading 4 of 50 tables to registers
	// Destination kept on stack
	// Full registers estimated 62 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_5x10Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX

mulAvxGFNI_5x10Xor_loop:
	// Load 10 outputs
	MOVQ (R9), R11
	VMOVDQU (R11)(R10*1), Y4
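	// Note: the Go declarations for these kernels live in the generated stub
	// file (galois_gen_amd64.go, per the header of this file); n is consumed
	// in whole 32-byte blocks here (64-byte blocks in the _64 kernels), so
	// any shorter tail is left for the caller to finish.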
MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y5 MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y6 MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y7 MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y8 MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y9 MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y10 MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y11 MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y12 MOVQ 216(R9), R11 VMOVDQU (R11)(R10*1), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD 
Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y4, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU Y5, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU Y6, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU Y7, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU Y8, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU Y9, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU Y10, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU Y11, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU Y12, (R11)(R10*1) MOVQ 216(R9), R11 VMOVDQU Y13, (R11)(R10*1) // Prepare for next loop ADDQ $0x20, R10 DECQ AX JNZ mulAvxGFNI_5x10Xor_loop VZEROUPPER mulAvxGFNI_5x10Xor_end: RET // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 115 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_5x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX MOVQ $0x0000000f, R11 MOVQ R11, X10 VPBROADCASTB X10, Y10 mulAvxTwo_5x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R9), R11 VMOVDQU (R11)(R10*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, 
Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R9), R11 VMOVDQU (R11)(R10*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU Y1, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU Y2, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU Y3, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU Y4, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU Y5, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU Y6, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU Y7, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU Y8, (R11)(R10*1) MOVQ 216(R9), R11 VMOVDQU Y9, (R11)(R10*1) // Prepare for next loop ADDQ $0x20, R10 DECQ AX JNZ mulAvxTwo_5x10Xor_loop VZEROUPPER mulAvxTwo_5x10Xor_end: RET // func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ 
out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X2 VPBROADCASTB X2, Y2 mulAvxTwo_6x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x1_64_loop VZEROUPPER mulAvxTwo_6x1_64_end: RET // func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 9 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to 
input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, CX mulGFNI_6x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z7 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z7, Z6 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z7 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z7 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z7 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z7 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (CX), Z7 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 VXORPD Z6, Z7, Z6 // Store 1 outputs VMOVDQU64 Z6, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x1_64_loop VZEROUPPER mulGFNI_6x1_64_end: RET // func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 9 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, CX mulAvxGFNI_6x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y7 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 VXORPD Y6, Y7, Y6 // Store 1 outputs VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x1_loop VZEROUPPER mulAvxGFNI_6x1_end: RET // func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 9 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ 
out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, CX mulGFNI_6x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (R9), Z6 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z7 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z7 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z7 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z7 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z7 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 VXORPD Z6, Z7, Z6 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (CX), Z7 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 VXORPD Z6, Z7, Z6 // Store 1 outputs VMOVDQU64 Z6, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x1_64Xor_loop VZEROUPPER mulGFNI_6x1_64Xor_end: RET // func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 9 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, CX mulAvxGFNI_6x1Xor_loop: // Load 1 outputs VMOVDQU (R9), Y6 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y7 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 VXORPD Y6, Y7, Y6 // Store 1 outputs VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x1Xor_loop VZEROUPPER mulAvxGFNI_6x1Xor_end: RET // func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x1_64Xor_end 
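// Editorial note: the _Xor suffix marks the accumulating variant of the
// kernel above: instead of overwriting the destination it first loads the
// existing output ("Load 1 outputs" in the loop below) and XORs every
// partial product into it. Minimal scalar sketch of the difference, with
// gfMul standing in for the GF(2^8) multiply (hypothetical, not a helper
// in this package):
//
//	func mulXorRef(gfMul func(c, v byte) byte, coef []byte, in [][]byte, out []byte) {
//		for i := range out {
//			acc := out[i] // Xor variant seeds with the old output byte
//			for j := range in {
//				acc ^= gfMul(coef[j], in[j][i])
//			}
//			out[i] = acc
//		}
//	}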
MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X2 VPBROADCASTB X2, Y2 mulAvxTwo_6x1_64Xor_loop: // Load 1 outputs VMOVDQU (R10), Y0 VMOVDQU 32(R10), Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x1_64Xor_loop VZEROUPPER mulAvxTwo_6x1_64Xor_end: RET // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), 
R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 ADDQ R12, R10 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X4 VPBROADCASTB X4, Y4 mulAvxTwo_6x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 
736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) ADDQ $0x40, R11 VMOVDQU Y2, (R10) VMOVDQU Y3, 32(R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x2_64_loop VZEROUPPER mulAvxTwo_6x2_64_end: RET // func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 16 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 ADDQ R11, R9 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, CX mulGFNI_6x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z14 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z14, Z12 VGF2P8AFFINEQB $0x00, Z1, Z14, Z13 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z14 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z14 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z14 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z14 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (CX), Z14 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 VXORPD Z13, Z15, Z13 // Store 2 outputs VMOVDQU64 Z12, (R10) ADDQ $0x40, R10 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x2_64_loop VZEROUPPER mulGFNI_6x2_64_end: RET // func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x2(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 16 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), 
R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 ADDQ R11, R9 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, CX mulAvxGFNI_6x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (CX), Y14 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x2_loop VZEROUPPER mulAvxGFNI_6x2_end: RET // func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 16 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 ADDQ R11, R9 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, CX mulGFNI_6x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (R10), Z12 VMOVDQU64 (R9), Z13 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z14 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z14 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z14 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z14 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 VXORPD 
Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z14 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 VXORPD Z13, Z15, Z13 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (CX), Z14 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 VXORPD Z12, Z15, Z12 VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 VXORPD Z13, Z15, Z13 // Store 2 outputs VMOVDQU64 Z12, (R10) ADDQ $0x40, R10 VMOVDQU64 Z13, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x2_64Xor_loop VZEROUPPER mulGFNI_6x2_64Xor_end: RET // func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 16 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 ADDQ R11, R9 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, CX mulAvxGFNI_6x2Xor_loop: // Load 2 outputs VMOVDQU (R10), Y12 VMOVDQU (R9), Y13 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (CX), Y14 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R10) ADDQ $0x20, R10 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x2Xor_loop VZEROUPPER mulAvxGFNI_6x2Xor_end: RET // func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x2_64Xor_end MOVQ 
in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 ADDQ R12, R10 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X4 VPBROADCASTB X4, Y4 mulAvxTwo_6x2_64Xor_loop: // Load 2 outputs VMOVDQU (R11), Y0 VMOVDQU 32(R11), Y1 VMOVDQU (R10), Y2 VMOVDQU 32(R10), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ 
$0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) ADDQ $0x40, R11 VMOVDQU Y2, (R10) VMOVDQU Y3, 32(R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x2_64Xor_loop VZEROUPPER mulAvxTwo_6x2_64Xor_end: RET // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R10 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X6 VPBROADCASTB X6, Y6 mulAvxTwo_6x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 
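// Editorial note on this step: Y11/Y13 hold the low nibbles and Y12/Y14
// the high nibbles of the 64-byte input block, so each VPSHUFB pair above
// performs the two 16-entry table lookups, and the XOR3WAY below folds both
// results into one output accumulator (a single VPTERNLOGD on GOAMD64_v4
// builds, two chained VPXORs otherwise). Per-byte sketch, with hypothetical
// table names:
//
//	out[i] ^= lowTbl[v&0x0f] ^ highTbl[v>>4]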
XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) ADDQ $0x40, R11 VMOVDQU Y2, (R12) VMOVDQU Y3, 32(R12) ADDQ $0x40, R12 VMOVDQU Y4, (R10) VMOVDQU Y5, 32(R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x3_64_loop VZEROUPPER mulAvxTwo_6x3_64_end: RET // func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 
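// Each VBROADCASTF32X2 in this run replicates one 64-bit matrix entry, an
// 8x8 bit-matrix over GF(2), into every lane of a Z register for use by
// VGF2P8AFFINEQB. With 6 inputs and 3 outputs there are 18 such matrices at
// an 8-byte stride; indexing sketch (layout inferred from the offsets,
// names hypothetical):
//
//	m := matrix[i*3+j] // uint64 bit-matrix for (input i, output j), byte offset 8*(i*3+j)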
VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R9 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R9 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, CX mulGFNI_6x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z21 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z21, Z18 VGF2P8AFFINEQB $0x00, Z1, Z21, Z19 VGF2P8AFFINEQB $0x00, Z2, Z21, Z20 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z21 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z21 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z21 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z21 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (CX), Z21 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 VXORPD Z20, Z22, Z20 // Store 3 outputs VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (R11) ADDQ $0x40, R11 VMOVDQU64 Z20, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x3_64_loop VZEROUPPER mulGFNI_6x3_64_end: RET // func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x3(SB), $0-88 // Loading 11 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R10 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, DX mulAvxGFNI_6x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 // Load and process 32 bytes from 
input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x3_loop VZEROUPPER mulAvxGFNI_6x3_end: RET // func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R9 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R9 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, CX mulGFNI_6x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (R10), Z18 VMOVDQU64 (R11), Z19 VMOVDQU64 (R9), Z20 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z21 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z21 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 VXORPD Z20, Z22, 
Z20 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z21 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z21 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z21 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 VXORPD Z20, Z22, Z20 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (CX), Z21 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 VXORPD Z18, Z22, Z18 VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 VXORPD Z19, Z22, Z19 VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 VXORPD Z20, Z22, Z20 // Store 3 outputs VMOVDQU64 Z18, (R10) ADDQ $0x40, R10 VMOVDQU64 Z19, (R11) ADDQ $0x40, R11 VMOVDQU64 Z20, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x3_64Xor_loop VZEROUPPER mulGFNI_6x3_64Xor_end: RET // func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 // Loading 11 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R10 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, DX mulAvxGFNI_6x3Xor_loop: // Load 3 outputs VMOVDQU (R11), Y11 VMOVDQU (R12), Y12 VMOVDQU (R10), Y13 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 
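// Register budget note: of the 18 matrices only Y0-Y10 (11, per the
// "Loading 11 of 18 tables" header) are preloaded; Y11-Y13 hold the three
// output accumulators, Y14 the input block and Y15 the scratch/product
// register (11+3+1+1 = 16 YMM). The remaining seven matrices, at offsets
// 88(CX)..136(CX), are therefore re-broadcast with VBROADCASTSD on every
// loop iteration, as in the lookups below.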
VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R11) ADDQ $0x20, R11 VMOVDQU Y12, (R12) ADDQ $0x20, R12 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x3Xor_loop VZEROUPPER mulAvxGFNI_6x3Xor_end: RET // func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R10 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X6 VPBROADCASTB X6, Y6 mulAvxTwo_6x3_64Xor_loop: // Load 3 outputs VMOVDQU (R11), Y0 VMOVDQU 32(R11), Y1 VMOVDQU (R12), Y2 VMOVDQU 32(R12), Y3 VMOVDQU (R10), Y4 VMOVDQU 32(R10), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, 
Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) ADDQ $0x40, R11 VMOVDQU Y2, (R12) VMOVDQU Y3, 32(R12) ADDQ $0x40, R12 VMOVDQU Y4, (R10) VMOVDQU Y5, 32(R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x3_64Xor_loop VZEROUPPER mulAvxTwo_6x3_64Xor_end: RET // func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX 
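// The shard pointers below are gathered at a 24-byte stride: in is a
// [][]byte, and on amd64 each slice header (ptr, len, cap) occupies 24
// bytes, so in[j]'s base pointer lives at byte offset j*24 from the slice
// base. The start offset is then added to every input and output pointer.
// Rough Go equivalent of the setup (names hypothetical):
//
//	src := make([][]byte, len(in))
//	for j := range in {
//		src[j] = in[j][start:] // one ADDQ start per base pointer
//	}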
MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R10 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R10 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X4 VPBROADCASTB X4, Y4 mulAvxTwo_6x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 
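// Table addressing: each (input i, output j) pair owns 64 bytes of lookup
// tables (32 low-nibble + 32 high-nibble), laid out input-major, i.e. at
// byte offset (i*4+j)*64 in this 6x4 kernel. The 1344/1376 pair just
// loaded is input 5, output 1: (5*4+1)*64 = 1344. Sketch (slice names
// hypothetical):
//
//	off := (i*4 + j) * 64
//	lowTbl, highTbl := matrix[off:off+32], matrix[off+32:off+64]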
VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x4_loop VZEROUPPER mulAvxTwo_6x4_end: RET // func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R9 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R9 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, CX mulGFNI_6x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z28 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z28, Z24 VGF2P8AFFINEQB $0x00, Z1, Z28, Z25 VGF2P8AFFINEQB $0x00, Z2, Z28, Z26 VGF2P8AFFINEQB $0x00, Z3, Z28, Z27 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (BX), Z28 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (SI), Z28 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (DI), Z28 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R8), Z28 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 
bytes from input 5 to 4 outputs VMOVDQU64 (CX), Z28 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 VXORPD Z27, Z29, Z27 // Store 4 outputs VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x4_64_loop VZEROUPPER mulGFNI_6x4_64_end: RET // func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x4(SB), $0-88 // Loading 10 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R10 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R10 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, DX mulAvxGFNI_6x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x4_loop VZEROUPPER mulAvxGFNI_6x4_end: RET // func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x4_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), CX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R9 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R9 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, CX mulGFNI_6x4_64Xor_loop: // Load 4 outputs VMOVDQU64 (R10), Z24 VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (R9), Z27 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (DX), Z28 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (BX), Z28 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (SI), Z28 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (DI), Z28 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R8), Z28 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 VXORPD Z25, 
Z29, Z25 VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 VXORPD Z27, Z29, Z27 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (CX), Z28 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 VXORPD Z24, Z29, Z24 VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 VXORPD Z25, Z29, Z25 VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 VXORPD Z26, Z29, Z26 VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 VXORPD Z27, Z29, Z27 // Store 4 outputs VMOVDQU64 Z24, (R10) ADDQ $0x40, R10 VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ AX JNZ mulGFNI_6x4_64Xor_loop VZEROUPPER mulGFNI_6x4_64Xor_end: RET // func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 // Loading 10 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R10 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R10 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, DX mulAvxGFNI_6x4Xor_loop: // Load 4 outputs VMOVDQU (R11), Y10 VMOVDQU (R12), Y11 VMOVDQU (R13), Y12 VMOVDQU (R10), Y13 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 
144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R11) ADDQ $0x20, R11 VMOVDQU Y11, (R12) ADDQ $0x20, R12 VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x4Xor_loop VZEROUPPER mulAvxGFNI_6x4Xor_end: RET // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R10 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R10 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X4 VPBROADCASTB X4, Y4 mulAvxTwo_6x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (R11), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), 
Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x4Xor_loop VZEROUPPER mulAvxTwo_6x4Xor_end: RET // func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X5 VPBROADCASTB X5, Y5 mulAvxTwo_6x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, 
Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x5_loop VZEROUPPER mulAvxTwo_6x5_end: RET // func 
mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64(SB), $0-88 // Loading 25 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX mulGFNI_6x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 
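// Editor's note: only 25 of the 30 affine tables fit in Z0-Z24 here, so the
// products for the last input use VGF2P8AFFINEQB.BCST, which reads the 8-byte
// bit matrix directly from 200(CX)..232(CX) and broadcasts it to every 64-bit
// lane, trading a memory operand for a register that is not available.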
VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulGFNI_6x5_64_loop VZEROUPPER mulGFNI_6x5_64_end: RET // func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x5(SB), $0-88 // Loading 9 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX mulAvxGFNI_6x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 
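// Editor's note: this AVX (256-bit) GFNI variant presumably exists for CPUs
// that expose GFNI without usable AVX-512 — that targeting is an inference,
// not something the generator states. With only Y0-Y15 available, just 9 of
// the 30 tables stay resident; the remaining coefficients are re-broadcast
// from memory with VBROADCASTSD on every loop iteration, as seen here.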
VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x5_loop VZEROUPPER mulAvxGFNI_6x5_end: RET // func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 // Loading 25 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX mulGFNI_6x5_64Xor_loop: // Load 5 outputs VMOVDQU64 (R11), Z25 VMOVDQU64 (R12), Z26 VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (R10), Z29 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 
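// Editor's note: the Xor variants begin each iteration by loading the current
// output vectors ("Load 5 outputs" above) and accumulate into them, rather
// than overwriting the destination as the plain variants do. A reference
// sketch of the accumulate semantics in Go; the row-major coefficient layout
// matrix[j*len(in)+i] is an assumption for illustration, and gfMul stands in
// for the table/affine multiply:
//
//	func mulXorSketch(matrix []byte, in, out [][]byte, gfMul func(c, v byte) byte) {
//		for j := range out {
//			for i := range in {
//				c := matrix[j*len(in)+i]
//				for k := range out[j] {
//					out[j][k] ^= gfMul(c, in[i][k]) // accumulate, don't overwrite
//				}
//			}
//		}
//	}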
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs VMOVDQU64 Z25, (R11) ADDQ $0x40, R11 VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulGFNI_6x5_64Xor_loop VZEROUPPER mulGFNI_6x5_64Xor_end: RET // func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 // Loading 9 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 37 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX mulAvxGFNI_6x5Xor_loop: // Load 5 outputs VMOVDQU (R11), Y9 VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (R10), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 
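// Editor's note: Y15 does double duty in this loop — VBROADCASTSD loads a
// coefficient matrix into it, and the following VGF2P8AFFINEQB immediately
// overwrites it with the product, so a single scratch register covers both
// roles while Y14 holds the 32 input bytes.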
VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R11) ADDQ $0x20, R11 VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x5Xor_loop VZEROUPPER mulAvxGFNI_6x5Xor_end: RET // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X5 VPBROADCASTB X5, Y5 mulAvxTwo_6x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (R11), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 
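// Editor's note: unlike the plain variant, this Xor kernel loads each
// destination vector (VMOVDQU (R11), Y0 and so on) before the first XOR3WAY,
// so every partial product — including input 0's — is XORed onto whatever the
// output slice already holds.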
XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, 
Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x5Xor_loop VZEROUPPER mulAvxTwo_6x5Xor_end: RET // func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_6x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), 
Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R15) ADDQ $0x20, R15 VMOVDQU Y5, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x6_loop VZEROUPPER mulAvxTwo_6x6_end: RET // func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64(SB), $8-88 // Loading 24 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 
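// Editor's note: VBROADCASTF32X2 replicates one 8-byte affine matrix across
// all eight 64-bit lanes of a ZMM register. A scalar Go sketch of what
// VGF2P8AFFINEQB (imm8 = 0) then computes per byte — the row ordering follows
// my reading of the SDM pseudocode, so treat it as a sketch, not a spec:
//
//	import "math/bits"
//
//	func affineByte(m uint64, x byte) (r byte) {
//		for i := 0; i < 8; i++ {
//			row := byte(m >> (8 * (7 - i))) // result bit i uses matrix byte 7-i
//			r |= byte(bits.OnesCount8(row&x)&1) << i
//		}
//		return
//	}
//
// With a suitable bit matrix this evaluates multiplication by a constant in
// GF(2^8), which is what each table in the matrix slice encodes.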
VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX mulGFNI_6x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 
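// Editor's note on the register budget: AVX-512 provides 32 ZMM registers.
// This kernel reserves Z24-Z29 for the six accumulators, Z30 for the input
// block, and Z31 as scratch, leaving exactly 24 registers for tables — hence
// "Loading 24 of 36 tables" above, with inputs 4 and 5 handled through the
// VGF2P8AFFINEQB.BCST memory operands seen here.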
VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulGFNI_6x6_64_loop VZEROUPPER mulGFNI_6x6_64_end: RET // func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x6(SB), $8-88 // Loading 8 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX mulAvxGFNI_6x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 
192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x6_loop VZEROUPPER mulAvxGFNI_6x6_end: RET // func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 // Loading 24 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX mulGFNI_6x6_64Xor_loop: // Load 6 outputs VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R10), Z29 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 
64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulGFNI_6x6_64Xor_loop VZEROUPPER mulGFNI_6x6_64Xor_end: RET // func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 // Loading 8 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start 
offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX mulAvxGFNI_6x6Xor_loop: // Load 6 outputs VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R10), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R11) ADDQ 
$0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_6x6Xor_loop VZEROUPPER mulAvxGFNI_6x6Xor_end: RET // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_6x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (R11), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 
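// Note (hand-added, not generator output): each (input, output) pair uses
// two 16-entry nibble lookup tables, duplicated across both 128-bit lanes,
// so that mul(c, b) = lo[b&0x0f] ^ hi[b>>4] in GF(2^8). The VPSHUFB pair
// performs both table lookups in parallel and XOR3WAY folds the two halves
// into the output accumulator. A rough scalar Go sketch, with hypothetical
// lo/hi tables for one coefficient c (lo[x] = mul(c, x), hi[x] = mul(c, x<<4)):
//
//	for n, b := range in {
//		out[n] ^= lo[b&0x0f] ^ hi[b>>4]
//	}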
XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R15) ADDQ $0x20, R15 VMOVDQU Y5, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x6Xor_loop VZEROUPPER mulAvxTwo_6x6Xor_end: RET // func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 96 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x7_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 
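// start (held in BP) is a byte offset into every shard; adding it to each
// output and input pointer below lets callers split the shards into
// disjoint ranges and process them independently.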
ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_6x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), 
Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R14) ADDQ $0x20, R14 VMOVDQU Y5, (R15) ADDQ $0x20, R15 VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ BP JNZ mulAvxTwo_6x7_loop VZEROUPPER mulAvxTwo_6x7_end: RET // func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64(SB), $8-88 // Loading 23 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 51 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI 
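// in is a [][]byte, so consecutive slice headers are 24 bytes apart
// (pointer, length, capacity on amd64); hence the 0/24/48/... offsets used
// to load the per-shard data pointers.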
MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_6x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 
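// The _64 GFNI kernels advance one ZMM register (64 bytes) per shard per
// iteration, which is why n is pre-shifted right by 6. Each VGF2P8AFFINEQB
// applies an 8x8 GF(2) bit matrix encoding multiplication by one matrix
// coefficient in GF(2^8); coefficients that did not fit in Z0-Z22 are
// fetched with the .BCST (embedded broadcast) form above instead.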
VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ BP JNZ mulGFNI_6x7_64_loop VZEROUPPER mulGFNI_6x7_64_end: RET // func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x7(SB), $8-88 // Loading 7 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 51 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_6x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_6x7_loop VZEROUPPER mulAvxGFNI_6x7_end: RET // func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 // Loading 23 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 51 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_6x7_64Xor_loop: // Load 7 outputs 
VMOVDQU64 (R10), Z23 VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R9), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs VMOVDQU64 Z23, (R10) ADDQ $0x40, R10 VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R9) ADDQ $0x40, R9 // Prepare for next loop DECQ BP JNZ 
mulGFNI_6x7_64Xor_loop VZEROUPPER mulGFNI_6x7_64Xor_end: RET // func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 // Loading 7 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 51 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_6x7Xor_loop: // Load 7 outputs VMOVDQU (R10), Y7 VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R9), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, 
Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs VMOVDQU Y7, (R10) ADDQ $0x20, R10 VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_6x7Xor_loop VZEROUPPER mulAvxGFNI_6x7Xor_end: RET // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 96 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x7Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_6x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (R10), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, 
Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), 
Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R14) ADDQ $0x20, R14 VMOVDQU Y5, (R15) ADDQ $0x20, R15 VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ BP JNZ mulAvxTwo_6x7Xor_loop VZEROUPPER mulAvxTwo_6x7Xor_end: RET // func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 109 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X8 VPBROADCASTB X8, Y8 mulAvxTwo_6x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, 
Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, 
Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x8_loop VZEROUPPER mulAvxTwo_6x8_end: RET // func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64(SB), $0-88 // Loading 22 of 48 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulGFNI_6x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, 
Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R10), R12 VMOVDQU64 Z22, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU64 Z23, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU64 Z24, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU64 Z25, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU64 Z26, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU64 Z27, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU64 Z28, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU64 
Z29, (R12)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ AX JNZ mulGFNI_6x8_64_loop VZEROUPPER mulGFNI_6x8_64_end: RET // func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x8(SB), $0-88 // Loading 6 of 48 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulAvxGFNI_6x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
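// Editorial note: only the first six coefficient tables fit in registers
// in this kernel ("Loading 6 of 48 tables"); the rest are fetched on
// demand, as in the triplet in progress here: VBROADCASTSD replicates the
// 8-byte affine matrix, VGF2P8AFFINEQB multiplies all 32 input bytes by
// it, and the VXORPD that follows folds the product into that output's
// accumulator.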
VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y9, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y10, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y11, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y12, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y13, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxGFNI_6x8_loop VZEROUPPER mulAvxGFNI_6x8_end: RET // func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 // Loading 22 of 48 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulGFNI_6x8_64Xor_loop: // Load 8 outputs MOVQ (R10), R12 VMOVDQU64 (R12)(R11*1), Z22 MOVQ 24(R10), R12 VMOVDQU64 (R12)(R11*1), Z23 MOVQ 48(R10), R12 VMOVDQU64 (R12)(R11*1), Z24 MOVQ 72(R10), R12 VMOVDQU64 (R12)(R11*1), Z25 MOVQ 96(R10), R12 VMOVDQU64 (R12)(R11*1), Z26 MOVQ 120(R10), 
R12 VMOVDQU64 (R12)(R11*1), Z27 MOVQ 144(R10), R12 VMOVDQU64 (R12)(R11*1), Z28 MOVQ 168(R10), R12 VMOVDQU64 (R12)(R11*1), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, 
Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R10), R12 VMOVDQU64 Z22, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU64 Z23, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU64 Z24, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU64 Z25, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU64 Z26, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU64 Z27, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU64 Z28, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU64 Z29, (R12)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ AX JNZ mulGFNI_6x8_64Xor_loop VZEROUPPER mulGFNI_6x8_64Xor_end: RET // func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 // Loading 6 of 48 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulAvxGFNI_6x8Xor_loop: // Load 8 outputs MOVQ (R10), R12 VMOVDQU (R12)(R11*1), Y6 MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y7 MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y8 MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y9 MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y10 MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y11 MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y12 MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
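// Editorial note: this Xor variant loaded its eight accumulators (Y6-Y13)
// from the destination shards at the top of the loop, so every product,
// like the one just computed into Y15, is XORed on top of the existing
// output bytes rather than overwriting them.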
VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y9, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y10, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y11, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y12, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y13, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxGFNI_6x8Xor_loop VZEROUPPER mulAvxGFNI_6x8Xor_end: RET // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 109 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to 
input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X8 VPBROADCASTB X8, Y8 mulAvxTwo_6x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R10), R12 VMOVDQU (R12)(R11*1), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 
1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, 
(R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x8Xor_loop VZEROUPPER mulAvxTwo_6x8Xor_end: RET // func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 122 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X9 VPBROADCASTB X9, Y9 mulAvxTwo_6x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 
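// Editorial note: the AVX2 kernels multiply in GF(2^8) with two VPSHUFB
// lookups per coefficient: Y12 holds the low and Y13 the high nibbles of
// the input bytes, while Y10/Y11 hold the matching 16-entry product
// tables; XORing the two lookups gives the full 8-bit product per byte.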
VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, 
Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y8, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x9_loop VZEROUPPER mulAvxTwo_6x9_end: RET // func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64(SB), $0-88 // Loading 21 of 54 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulGFNI_6x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 
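// Editorial note: the 512-bit GFNI kernels keep one accumulator per
// output shard (Z21-Z29 in mulGFNI_6x9_64) plus a scratch register (Z31);
// each VGF2P8AFFINEQB/VXORPD pair adds one term of the matrix product,
// 64 bytes at a time.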
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 2 to 9 outputs
    VMOVDQU64 (DI), Z30
    ADDQ $0x40, DI
    VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
    VXORPD Z21, Z31, Z21
    VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
    VXORPD Z22, Z31, Z22
    VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
    VXORPD Z23, Z31, Z23
    VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31
    VXORPD Z24, Z31, Z24
    VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 3 to 9 outputs
    VMOVDQU64 (R8), Z30
    ADDQ $0x40, R8
    VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
    VXORPD Z21, Z31, Z21
    VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
    VXORPD Z22, Z31, Z22
    VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
    VXORPD Z23, Z31, Z23
    VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
    VXORPD Z24, Z31, Z24
    VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 4 to 9 outputs
    VMOVDQU64 (R9), Z30
    ADDQ $0x40, R9
    VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
    VXORPD Z21, Z31, Z21
    VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
    VXORPD Z22, Z31, Z22
    VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
    VXORPD Z23, Z31, Z23
    VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
    VXORPD Z24, Z31, Z24
    VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 5 to 9 outputs
    VMOVDQU64 (DX), Z30
    ADDQ $0x40, DX
    VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
    VXORPD Z21, Z31, Z21
    VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
    VXORPD Z22, Z31, Z22
    VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
    VXORPD Z23, Z31, Z23
    VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31
    VXORPD Z24, Z31, Z24
    VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Store 9 outputs
    MOVQ (R10), R12
    VMOVDQU64 Z21, (R12)(R11*1)
    MOVQ 24(R10), R12
    VMOVDQU64 Z22, (R12)(R11*1)
    MOVQ 48(R10), R12
    VMOVDQU64 Z23, (R12)(R11*1)
    MOVQ 72(R10), R12
    VMOVDQU64 Z24, (R12)(R11*1)
    MOVQ 96(R10), R12
    VMOVDQU64 Z25, (R12)(R11*1)
    MOVQ 120(R10), R12
    VMOVDQU64 Z26, (R12)(R11*1)
    MOVQ 144(R10), R12
    VMOVDQU64 Z27, (R12)(R11*1)
    MOVQ 168(R10), R12
    VMOVDQU64 Z28, (R12)(R11*1)
    MOVQ 192(R10), R12
    VMOVDQU64 Z29, (R12)(R11*1)

    // Prepare for next loop
    ADDQ $0x40, R11
    DECQ AX
    JNZ mulGFNI_6x9_64_loop
    VZEROUPPER

mulGFNI_6x9_64_end:
    RET

// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
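// Editorial note: mulAvxGFNI_6x9 below is the 256-bit GFNI kernel for a
// 6-input, 9-output coding matrix. Multiplication by a fixed constant is
// linear over GF(2), so each coefficient is pre-encoded as an 8x8 bit
// matrix (one uint64, laid out input-major at matrix[i*9+j]);
// VGF2P8AFFINEQB applies it to 32 input bytes per instruction and VXORPD
// folds the products into one accumulator per output shard. A hedged
// scalar sketch of what one byte position computes (gfMul, coeff, in,
// out, and pos are illustrative names, not part of this file):
//
//	for j := 0; j < 9; j++ { // output shards
//		var acc byte
//		for i := 0; i < 6; i++ { // input shards
//			acc ^= gfMul(coeff[i*9+j], in[i][pos])
//		}
//		out[j][pos] = acc
//	}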
TEXT ·mulAvxGFNI_6x9(SB), $0-88 // Loading 5 of 54 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulAvxGFNI_6x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, 
Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y9, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y10, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y11, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y12, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y13, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxGFNI_6x9_loop VZEROUPPER mulAvxGFNI_6x9_end: RET // func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 // Loading 21 of 54 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulGFNI_6x9_64Xor_loop: // Load 9 
outputs MOVQ (R10), R12 VMOVDQU64 (R12)(R11*1), Z21 MOVQ 24(R10), R12 VMOVDQU64 (R12)(R11*1), Z22 MOVQ 48(R10), R12 VMOVDQU64 (R12)(R11*1), Z23 MOVQ 72(R10), R12 VMOVDQU64 (R12)(R11*1), Z24 MOVQ 96(R10), R12 VMOVDQU64 (R12)(R11*1), Z25 MOVQ 120(R10), R12 VMOVDQU64 (R12)(R11*1), Z26 MOVQ 144(R10), R12 VMOVDQU64 (R12)(R11*1), Z27 MOVQ 168(R10), R12 VMOVDQU64 (R12)(R11*1), Z28 MOVQ 192(R10), R12 VMOVDQU64 (R12)(R11*1), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, 
Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R10), R12 VMOVDQU64 Z21, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU64 Z22, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU64 Z23, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU64 Z24, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU64 Z25, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU64 Z26, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU64 Z27, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU64 Z28, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU64 Z29, (R12)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ AX JNZ mulGFNI_6x9_64Xor_loop VZEROUPPER mulGFNI_6x9_64Xor_end: RET // func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 // Loading 5 of 54 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulAvxGFNI_6x9Xor_loop: // Load 9 outputs MOVQ (R10), R12 VMOVDQU (R12)(R11*1), Y5 MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y6 MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y7 MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y8 MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y9 MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y10 MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y11 MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y12 MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 
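// Editorial note: coefficient offsets advance by 8 bytes per output and
// by 72 bytes (9 outputs x 8) per input, so input 1 occupies
// 72(CX)-136(CX); the broadcasts below finish its remaining outputs.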
VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 
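// Editorial note: with nine outputs there are not enough YMM registers
// left to pin destination pointers, so the store block at the end of this
// loop reloads each shard base from the out slice header in R10 and
// indexes it with the running byte offset in R11 ("destination kept on
// stack").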
    VBROADCASTSD 408(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 416(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 424(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 9 outputs
    MOVQ (R10), R12
    VMOVDQU Y5, (R12)(R11*1)
    MOVQ 24(R10), R12
    VMOVDQU Y6, (R12)(R11*1)
    MOVQ 48(R10), R12
    VMOVDQU Y7, (R12)(R11*1)
    MOVQ 72(R10), R12
    VMOVDQU Y8, (R12)(R11*1)
    MOVQ 96(R10), R12
    VMOVDQU Y9, (R12)(R11*1)
    MOVQ 120(R10), R12
    VMOVDQU Y10, (R12)(R11*1)
    MOVQ 144(R10), R12
    VMOVDQU Y11, (R12)(R11*1)
    MOVQ 168(R10), R12
    VMOVDQU Y12, (R12)(R11*1)
    MOVQ 192(R10), R12
    VMOVDQU Y13, (R12)(R11*1)

    // Prepare for next loop
    ADDQ $0x20, R11
    DECQ AX
    JNZ mulAvxGFNI_6x9Xor_loop
    VZEROUPPER

mulAvxGFNI_6x9Xor_end:
    RET

// func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept on stack
    // Full registers estimated 122 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxTwo_6x9Xor_end
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), DX
    MOVQ out_base+48(FP), R10
    MOVQ start+72(FP), R11

    // Add start offset to input
    ADDQ R11, BX
    ADDQ R11, SI
    ADDQ R11, DI
    ADDQ R11, R8
    ADDQ R11, R9
    ADDQ R11, DX
    MOVQ $0x0000000f, R12
    MOVQ R12, X9
    VPBROADCASTB X9, Y9

mulAvxTwo_6x9Xor_loop:
    // Load and process 32 bytes from input 0 to 9 outputs
    VMOVDQU (BX), Y12
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y12, Y13
    VPAND Y9, Y12, Y12
    VPAND Y9, Y13, Y13
    MOVQ (R10), R12
    VMOVDQU (R12)(R11*1), Y0
    VMOVDQU (CX), Y10
    VMOVDQU 32(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y0)
    MOVQ 24(R10), R12
    VMOVDQU (R12)(R11*1), Y1
    VMOVDQU 64(CX), Y10
    VMOVDQU 96(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y1)
    MOVQ 48(R10), R12
    VMOVDQU (R12)(R11*1), Y2
    VMOVDQU 128(CX), Y10
    VMOVDQU 160(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y2)
    MOVQ 72(R10), R12
    VMOVDQU (R12)(R11*1), Y3
    VMOVDQU 192(CX), Y10
    VMOVDQU 224(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y3)
    MOVQ 96(R10), R12
    VMOVDQU (R12)(R11*1), Y4
    VMOVDQU 256(CX), Y10
    VMOVDQU 288(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y4)
    MOVQ 120(R10), R12
    VMOVDQU (R12)(R11*1), Y5
    VMOVDQU 320(CX), Y10
    VMOVDQU 352(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y5)
    MOVQ 144(R10), R12
    VMOVDQU (R12)(R11*1), Y6
    VMOVDQU 384(CX), Y10
    VMOVDQU 416(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y6)
    MOVQ 168(R10), R12
    VMOVDQU (R12)(R11*1), Y7
    VMOVDQU 448(CX), Y10
    VMOVDQU 480(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y7)
    MOVQ 192(R10), R12
    VMOVDQU (R12)(R11*1), Y8
    VMOVDQU 512(CX), Y10
    VMOVDQU 544(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y8)

    // Load and process 32 bytes from input 1 to 9 outputs
    VMOVDQU (SI), Y12
    ADDQ $0x20, SI
    VPSRLQ $0x04, Y12, Y13
    VPAND Y9, Y12, Y12
    VPAND Y9, Y13, Y13
    VMOVDQU 576(CX), Y10
    VMOVDQU 608(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y0)
    VMOVDQU 640(CX), Y10
    VMOVDQU 672(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
    XOR3WAY( $0x00, Y10, Y11, Y1)
    VMOVDQU 704(CX), Y10
    VMOVDQU 736(CX), Y11
    VPSHUFB Y12, Y10, Y10
    VPSHUFB Y13, Y11, Y11
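// Editorial note: the XOR3WAY invocation below (macro defined at the top
// of this file) folds both table lookups into the accumulator; under
// GOAMD64_v4 it is a single VPTERNLOGD, otherwise two VPXORs, which is
// why these AVX2 kernels also list AVX512F/AVX512VL in their Requires
// lines.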
XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), 
Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y8, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x9Xor_loop VZEROUPPER mulAvxTwo_6x9Xor_end: RET // func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 135 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X10 VPBROADCASTB X10, Y10 mulAvxTwo_6x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), 
Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, 
Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 
VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU Y9, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x10_loop VZEROUPPER mulAvxTwo_6x10_end: RET // func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64(SB), $0-88 // Loading 20 of 60 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulGFNI_6x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, 
Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R10), R12 VMOVDQU64 Z20, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU64 Z21, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU64 Z22, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU64 Z23, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU64 Z24, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU64 Z25, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU64 Z26, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU64 Z27, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU64 Z28, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU64 Z29, (R12)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ AX JNZ mulGFNI_6x10_64_loop VZEROUPPER mulGFNI_6x10_64_end: RET // func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x10(SB), $0-88 // Loading 4 
of 60 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulAvxGFNI_6x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y9, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y10, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y11, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y12, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU Y13, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxGFNI_6x10_loop VZEROUPPER mulAvxGFNI_6x10_end: RET // func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 // Loading 20 of 60 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_6x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 
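	// Register budget note: Z20-Z29 will hold the 10 output
	// accumulators and Z30/Z31 serve as scratch, which leaves room for
	// exactly 20 of the 60 coefficient matrices in Z0-Z19 (as the
	// function header says); the remaining 40 are applied from memory
	// via the embedded-broadcast (.BCST) form inside the loop.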
VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulGFNI_6x10_64Xor_loop: // Load 10 outputs MOVQ (R10), R12 VMOVDQU64 (R12)(R11*1), Z20 MOVQ 24(R10), R12 VMOVDQU64 (R12)(R11*1), Z21 MOVQ 48(R10), R12 VMOVDQU64 (R12)(R11*1), Z22 MOVQ 72(R10), R12 VMOVDQU64 (R12)(R11*1), Z23 MOVQ 96(R10), R12 VMOVDQU64 (R12)(R11*1), Z24 MOVQ 120(R10), R12 VMOVDQU64 (R12)(R11*1), Z25 MOVQ 144(R10), R12 VMOVDQU64 (R12)(R11*1), Z26 MOVQ 168(R10), R12 VMOVDQU64 (R12)(R11*1), Z27 MOVQ 192(R10), R12 VMOVDQU64 (R12)(R11*1), Z28 MOVQ 216(R10), R12 VMOVDQU64 (R12)(R11*1), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 
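	// Note: the VGF2P8AFFINEQB.BCST form used here reads the 8-byte
	// matrix straight from the table (e.g. 296(CX)) and broadcasts it
	// to all eight qword lanes inside the instruction, so coefficients
	// without a dedicated Z register need no separate load.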
VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R10), R12 VMOVDQU64 Z20, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU64 Z21, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU64 Z22, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU64 Z23, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU64 Z24, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU64 Z25, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU64 Z26, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU64 Z27, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU64 Z28, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU64 Z29, (R12)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ AX JNZ mulGFNI_6x10_64Xor_loop VZEROUPPER mulGFNI_6x10_64Xor_end: RET // func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 // Loading 4 of 60 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_6x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX mulAvxGFNI_6x10Xor_loop: // Load 10 outputs MOVQ (R10), R12 VMOVDQU (R12)(R11*1), Y4 MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y5 MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y6 MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y7 MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y8 MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y9 MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y10 MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y11 MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y12 MOVQ 216(R10), R12 VMOVDQU (R12)(R11*1), Y13 // Load and process 32 
bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU 
(R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y9, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y10, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y11, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y12, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU Y13, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxGFNI_6x10Xor_loop VZEROUPPER mulAvxGFNI_6x10Xor_end: RET // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 135 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X10 VPBROADCASTB X10, Y10 mulAvxTwo_6x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R10), R12 VMOVDQU (R12)(R11*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R10), R12 VMOVDQU 
(R12)(R11*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R10), R12 VMOVDQU (R12)(R11*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) 
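	// Reference sketch of what this 6x10 Xor kernel computes, assuming
	// the table layout used above: for input k and output j, the matrix
	// argument holds two lane-duplicated 32-byte lookup tables at byte
	// offset (k*10+j)*64, low-nibble products first and high-nibble
	// products at +32. The function name is illustrative:
	//
	//	func mulAvxTwo6x10XorRef(tables []byte, in, out [][]byte, start, n int) {
	//		for i := start; i < start+n; i++ {
	//			for j := 0; j < 10; j++ {
	//				acc := out[j][i]
	//				for k := 0; k < 6; k++ {
	//					t := tables[(k*10+j)*64:]
	//					b := in[k][i]
	//					acc ^= t[b&0xf] ^ t[32+(b>>4)]
	//				}
	//				out[j][i] = acc
	//			}
	//		}
	//	}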
VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 
3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU Y9, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x10Xor_loop VZEROUPPER mulAvxTwo_6x10Xor_end: RET // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_7x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X2 VPBROADCASTB X2, Y2 mulAvxTwo_7x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, 
Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x1_64_loop VZEROUPPER mulAvxTwo_7x1_64_end: RET // func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), CX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, CX mulGFNI_7x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z8 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z8, Z7 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z8 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z8 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z8 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z8 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (R9), Z8 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU64 (CX), Z8 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 VXORPD Z7, Z8, Z7 // Store 1 outputs VMOVDQU64 Z7, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ AX JNZ mulGFNI_7x1_64_loop 
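	// Note: as with every AVX kernel in this file, VZEROUPPER below
	// clears the upper vector state before RET so that subsequent SSE
	// code does not pay the AVX-SSE transition penalty.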
VZEROUPPER mulGFNI_7x1_64_end: RET // func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), CX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, CX mulAvxGFNI_7x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 VXORPD Y7, Y8, Y7 // Store 1 outputs VMOVDQU Y7, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_7x1_loop VZEROUPPER mulAvxGFNI_7x1_end: RET // func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), CX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R10 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, CX mulGFNI_7x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (R10), Z7 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z8 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z8 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z8 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 VXORPD Z7, Z8, Z7 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), 
// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	MOVQ out_base+48(FP), R10
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R10
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, CX

mulGFNI_7x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R10), Z7

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z8
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z8
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z8
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z8
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z8
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z8
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (CX), Z8
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z6, Z8, Z8
	VXORPD Z7, Z8, Z7

	// Store 1 outputs
	VMOVDQU64 Z7, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x1_64Xor_loop
	VZEROUPPER

mulGFNI_7x1_64Xor_end:
	RET

// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 10 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	MOVQ out_base+48(FP), R10
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R10
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10

	// Add start offset to input
	ADDQ R11, DX
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, R9
	ADDQ R11, CX

mulAvxGFNI_7x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R10), Y7

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y8
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y8
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y8
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y8
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y8
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y8
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (CX), Y8
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y6, Y8, Y8
	VXORPD Y7, Y8, Y7

	// Store 1 outputs
	VMOVDQU Y7, (R10)
	ADDQ $0x20, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x1Xor_loop
	VZEROUPPER

mulAvxGFNI_7x1Xor_end:
	RET
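// The Xor variants differ from the plain kernels in one way: instead of
// overwriting the destination, they first load the existing output bytes
// ("Load 1 outputs" above) and XOR the new products into them, i.e.
// out ^= sum, so callers can accumulate parity over several passes. The
// AVX2 kernels below do the same accumulation through the XOR3WAY macro
// defined near the top of this file (a single VPTERNLOGD on GOAMD64_v4).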
// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 34 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_7x1_64Xor_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R11
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, R10
	ADDQ R12, DX
	MOVQ $0x0000000f, R12
	MOVQ R12, X2
	VPBROADCASTB X2, Y2

mulAvxTwo_7x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU (R11), Y0
	VMOVDQU 32(R11), Y1

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU (BX), Y6
	VMOVDQU 32(BX), Y5
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU (CX), Y3
	VMOVDQU 32(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU (SI), Y6
	VMOVDQU 32(SI), Y5
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 64(CX), Y3
	VMOVDQU 96(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU (DI), Y6
	VMOVDQU 32(DI), Y5
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 128(CX), Y3
	VMOVDQU 160(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU (R8), Y6
	VMOVDQU 32(R8), Y5
	ADDQ $0x40, R8
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 192(CX), Y3
	VMOVDQU 224(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU (R9), Y6
	VMOVDQU 32(R9), Y5
	ADDQ $0x40, R9
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 256(CX), Y3
	VMOVDQU 288(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU (R10), Y6
	VMOVDQU 32(R10), Y5
	ADDQ $0x40, R10
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 320(CX), Y3
	VMOVDQU 352(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU (DX), Y6
	VMOVDQU 32(DX), Y5
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 384(CX), Y3
	VMOVDQU 416(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Store 1 outputs
	VMOVDQU Y0, (R11)
	VMOVDQU Y1, 32(R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_7x1_64Xor_loop
	VZEROUPPER

mulAvxTwo_7x1_64Xor_end:
	RET
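// How the mulAvxTwo kernels multiply: each GF(2^8) product uses the classic
// 4-bit split. VPSRLQ and VPAND with the broadcast 0x0f mask separate every
// byte into low and high nibbles, each nibble indexes a 32-byte VPSHUFB
// lookup table of partial products, and the two lookups are XORed together.
// One (input, output) pair therefore consumes 64 bytes of the matrix slice,
// laid out input-major: input 0's tables for all outputs, then input 1's.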
// func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_7x2_64(SB), $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_7x2_64_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, DX
	MOVQ $0x0000000f, R13
	MOVQ R13, X4
	VPBROADCASTB X4, Y4

mulAvxTwo_7x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y9
	VMOVDQU 32(BX), Y11
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU (CX), Y5
	VMOVDQU 32(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y0
	VPXOR Y7, Y8, Y1
	VMOVDQU 64(CX), Y5
	VMOVDQU 96(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y2
	VPXOR Y7, Y8, Y3

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y9
	VMOVDQU 32(SI), Y11
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 128(CX), Y5
	VMOVDQU 160(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 192(CX), Y5
	VMOVDQU 224(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y9
	VMOVDQU 32(DI), Y11
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 256(CX), Y5
	VMOVDQU 288(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 320(CX), Y5
	VMOVDQU 352(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y9
	VMOVDQU 32(R8), Y11
	ADDQ $0x40, R8
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 384(CX), Y5
	VMOVDQU 416(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 448(CX), Y5
	VMOVDQU 480(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y9
	VMOVDQU 32(R9), Y11
	ADDQ $0x40, R9
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 512(CX), Y5
	VMOVDQU 544(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 576(CX), Y5
	VMOVDQU 608(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y9
	VMOVDQU 32(R10), Y11
	ADDQ $0x40, R10
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 640(CX), Y5
	VMOVDQU 672(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 704(CX), Y5
	VMOVDQU 736(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU (DX), Y9
	VMOVDQU 32(DX), Y11
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 768(CX), Y5
	VMOVDQU 800(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 832(CX), Y5
	VMOVDQU 864(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Store 2 outputs
	VMOVDQU Y0, (R12)
	VMOVDQU Y1, 32(R12)
	ADDQ $0x40, R12
	VMOVDQU Y2, (R11)
	VMOVDQU Y3, 32(R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_7x2_64_loop
	VZEROUPPER

mulAvxTwo_7x2_64_end:
	RET
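// Table layout for the GFNI kernels: only one 8-byte bit-matrix is needed
// per (input, output) pair, so the 7x2 kernels below read 14 qwords from
// the matrix slice, again input-major (input 0 outputs 0-1, then input 1,
// and so on).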
// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x2_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x2_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	MOVQ out_base+48(FP), R10
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R10
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11
	ADDQ R12, R10

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, CX

mulGFNI_7x2_64_loop:
	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (DX), Z16
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z14
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z15

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z16
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z16
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z16
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z16
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z16
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (CX), Z16
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Store 2 outputs
	VMOVDQU64 Z14, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z15, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x2_64_loop
	VZEROUPPER

mulGFNI_7x2_64_end:
	RET
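// mulAvxGFNI_7x2 below reports "Loading 12 of 14 tables to registers": with
// only 16 YMM registers available, the two matrices for the last input do
// not fit, so they are re-broadcast from 96(CX) and 104(CX) on every loop
// iteration, trading a small amount of load bandwidth for registers.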
// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x2(SB), $0-88
	// Loading 12 of 14 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x2_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, DX

mulAvxGFNI_7x2_loop:
	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x2_loop
	VZEROUPPER

mulAvxGFNI_7x2_end:
	RET

// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x2_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x2_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	MOVQ out_base+48(FP), R10
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R10
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R11
	ADDQ R12, R10

	// Add start offset to input
	ADDQ R12, DX
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, R9
	ADDQ R12, CX

mulGFNI_7x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU64 (R11), Z14
	VMOVDQU64 (R10), Z15

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU64 (DX), Z16
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z1, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU64 (BX), Z16
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z2, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z3, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU64 (SI), Z16
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z4, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z5, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU64 (DI), Z16
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z6, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z7, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU64 (R8), Z16
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z8, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z9, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU64 (R9), Z16
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z10, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z11, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU64 (CX), Z16
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z12, Z16, Z17
	VXORPD Z14, Z17, Z14
	VGF2P8AFFINEQB $0x00, Z13, Z16, Z17
	VXORPD Z15, Z17, Z15

	// Store 2 outputs
	VMOVDQU64 Z14, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z15, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x2_64Xor_loop
	VZEROUPPER

mulGFNI_7x2_64Xor_end:
	RET
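// Loop bookkeeping, common to all kernels here: n arrives as a byte count
// and is converted to an iteration count up front (SHRQ $0x06 for the
// 64-byte kernels, SHRQ $0x05 for the 32-byte AVX kernels), and start is
// added to every input and output pointer so a caller can process just a
// sub-range of each shard.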
// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88
	// Loading 12 of 14 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 18 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x2Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	VBROADCASTSD 88(CX), Y11
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, DX

mulAvxGFNI_7x2Xor_loop:
	// Load 2 outputs
	VMOVDQU (R12), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 2 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 2 outputs
	VMOVDQU Y12, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x2Xor_loop
	VZEROUPPER

mulAvxGFNI_7x2Xor_end:
	RET

// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_7x2_64Xor_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R11
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R12
	ADDQ R13, R11

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, DX
	MOVQ $0x0000000f, R13
	MOVQ R13, X4
	VPBROADCASTB X4, Y4

mulAvxTwo_7x2_64Xor_loop:
	// Load 2 outputs
	VMOVDQU (R12), Y0
	VMOVDQU 32(R12), Y1
	VMOVDQU (R11), Y2
	VMOVDQU 32(R11), Y3

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y9
	VMOVDQU 32(BX), Y11
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU (CX), Y5
	VMOVDQU 32(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 64(CX), Y5
	VMOVDQU 96(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y9
	VMOVDQU 32(SI), Y11
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 128(CX), Y5
	VMOVDQU 160(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 192(CX), Y5
	VMOVDQU 224(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)
	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y9
	VMOVDQU 32(DI), Y11
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 256(CX), Y5
	VMOVDQU 288(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 320(CX), Y5
	VMOVDQU 352(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y9
	VMOVDQU 32(R8), Y11
	ADDQ $0x40, R8
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 384(CX), Y5
	VMOVDQU 416(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 448(CX), Y5
	VMOVDQU 480(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU (R9), Y9
	VMOVDQU 32(R9), Y11
	ADDQ $0x40, R9
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 512(CX), Y5
	VMOVDQU 544(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 576(CX), Y5
	VMOVDQU 608(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 5 to 2 outputs
	VMOVDQU (R10), Y9
	VMOVDQU 32(R10), Y11
	ADDQ $0x40, R10
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 640(CX), Y5
	VMOVDQU 672(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 704(CX), Y5
	VMOVDQU 736(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Load and process 64 bytes from input 6 to 2 outputs
	VMOVDQU (DX), Y9
	VMOVDQU 32(DX), Y11
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 768(CX), Y5
	VMOVDQU 800(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	XOR3WAY( $0x00, Y7, Y8, Y1)
	VMOVDQU 832(CX), Y5
	VMOVDQU 864(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// Store 2 outputs
	VMOVDQU Y0, (R12)
	VMOVDQU Y1, 32(R12)
	ADDQ $0x40, R12
	VMOVDQU Y2, (R11)
	VMOVDQU Y3, 32(R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_7x2_64Xor_loop
	VZEROUPPER

mulAvxTwo_7x2_64Xor_end:
	RET

// func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_7x3_64(SB), $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 94 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_7x3_64_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX
	MOVQ $0x0000000f, R14
	MOVQ R14, X6
	VPBROADCASTB X6, Y6

mulAvxTwo_7x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y11
	VMOVDQU 32(BX), Y13
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y0
	VPXOR Y9, Y10, Y1
	VMOVDQU 64(CX), Y7
	VMOVDQU 96(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y2
	VPXOR Y9, Y10, Y3
	VMOVDQU 128(CX), Y7
	VMOVDQU 160(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y4
	VPXOR Y9, Y10, Y5

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y11
	VMOVDQU 32(SI), Y13
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 192(CX), Y7
	VMOVDQU 224(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 256(CX), Y7
	VMOVDQU 288(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 320(CX), Y7
	VMOVDQU 352(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y11
	VMOVDQU 32(DI), Y13
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 384(CX), Y7
	VMOVDQU 416(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 448(CX), Y7
	VMOVDQU 480(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 512(CX), Y7
	VMOVDQU 544(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y11
	VMOVDQU 32(R8), Y13
	ADDQ $0x40, R8
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 576(CX), Y7
	VMOVDQU 608(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 640(CX), Y7
	VMOVDQU 672(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 704(CX), Y7
	VMOVDQU 736(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y11
	VMOVDQU 32(R9), Y13
	ADDQ $0x40, R9
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 768(CX), Y7
	VMOVDQU 800(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 832(CX), Y7
	VMOVDQU 864(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 896(CX), Y7
	VMOVDQU 928(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y11
	VMOVDQU 32(R10), Y13
	ADDQ $0x40, R10
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 960(CX), Y7
	VMOVDQU 992(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1024(CX), Y7
	VMOVDQU 1056(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1088(CX), Y7
	VMOVDQU 1120(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU (DX), Y11
	VMOVDQU 32(DX), Y13
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 1152(CX), Y7
	VMOVDQU 1184(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1216(CX), Y7
	VMOVDQU 1248(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1280(CX), Y7
	VMOVDQU 1312(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Store 3 outputs
	VMOVDQU Y0, (R12)
	VMOVDQU Y1, 32(R12)
	ADDQ $0x40, R12
	VMOVDQU Y2, (R13)
	VMOVDQU Y3, 32(R13)
	ADDQ $0x40, R13
	VMOVDQU Y4, (R11)
	VMOVDQU Y5, 32(R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_7x3_64_loop
	VZEROUPPER

mulAvxTwo_7x3_64_end:
	RET
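// Register pressure scales with inputs x outputs: the 7x3 GFNI kernel below
// still keeps all 21 bit-matrices resident in Z0-Z20 ("Loading all tables"),
// while the AVX variant mulAvxGFNI_7x3 fits only 11 of 21 in YMM registers
// and broadcasts the remainder from memory inside the loop.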
// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x3_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x3_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	MOVQ out_base+48(FP), R10
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, CX

mulGFNI_7x3_64_loop:
	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z23

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z24
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z24
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 3 outputs
	VMOVDQU64 Z21, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x3_64_loop
	VZEROUPPER

mulGFNI_7x3_64_end:
	RET

// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x3(SB), $0-88
	// Loading 11 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x3_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX

mulAvxGFNI_7x3_loop:
	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y13

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x3_loop
	VZEROUPPER

mulAvxGFNI_7x3_end:
	RET

// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_7x3_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_7x3_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), CX
	MOVQ out_base+48(FP), R10
	MOVQ out_base+48(FP), R10
	MOVQ (R10), R11
	MOVQ 24(R10), R12
	MOVQ 48(R10), R10
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R10

	// Add start offset to input
	ADDQ R13, DX
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, CX

mulGFNI_7x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU64 (R11), Z21
	VMOVDQU64 (R12), Z22
	VMOVDQU64 (R10), Z23

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU64 (DX), Z24
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU64 (BX), Z24
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU64 (SI), Z24
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU64 (DI), Z24
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU64 (R8), Z24
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU64 (R9), Z24
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU64 (CX), Z24
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
	VXORPD Z21, Z25, Z21
	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
	VXORPD Z22, Z25, Z22
	VGF2P8AFFINEQB $0x00, Z20, Z24, Z25
	VXORPD Z23, Z25, Z23

	// Store 3 outputs
	VMOVDQU64 Z21, (R11)
	ADDQ $0x40, R11
	VMOVDQU64 Z22, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z23, (R10)
	ADDQ $0x40, R10

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_7x3_64Xor_loop
	VZEROUPPER

mulGFNI_7x3_64Xor_end:
	RET

// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88
	// Loading 11 of 21 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 26 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_7x3Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	VBROADCASTSD 80(CX), Y10
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX

mulAvxGFNI_7x3Xor_loop:
	// Load 3 outputs
	VMOVDQU (R12), Y11
	VMOVDQU (R13), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y13, Y15, Y13
	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 3 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 3 outputs
	VMOVDQU Y11, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y12, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y13, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_7x3Xor_loop
	VZEROUPPER

mulAvxGFNI_7x3Xor_end:
	RET

// func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 94 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_7x3_64Xor_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R11
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R11

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, DX
	MOVQ $0x0000000f, R14
	MOVQ R14, X6
	VPBROADCASTB X6, Y6

mulAvxTwo_7x3_64Xor_loop:
	// Load 3 outputs
	VMOVDQU (R12), Y0
	VMOVDQU 32(R12), Y1
	VMOVDQU (R13), Y2
	VMOVDQU 32(R13), Y3
	VMOVDQU (R11), Y4
	VMOVDQU 32(R11), Y5

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y11
	VMOVDQU 32(BX), Y13
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 64(CX), Y7
	VMOVDQU 96(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 128(CX), Y7
	VMOVDQU 160(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y11
	VMOVDQU 32(SI), Y13
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 192(CX), Y7
	VMOVDQU 224(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 256(CX), Y7
	VMOVDQU 288(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 320(CX), Y7
	VMOVDQU 352(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y11
	VMOVDQU 32(DI), Y13
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 384(CX), Y7
	VMOVDQU 416(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 448(CX), Y7
	VMOVDQU 480(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 512(CX), Y7
	VMOVDQU 544(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y11
	VMOVDQU 32(R8), Y13
	ADDQ $0x40, R8
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 576(CX), Y7
	VMOVDQU 608(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 640(CX), Y7
	VMOVDQU 672(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 704(CX), Y7
	VMOVDQU 736(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU (R9), Y11
	VMOVDQU 32(R9), Y13
	ADDQ $0x40, R9
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 768(CX), Y7
	VMOVDQU 800(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 832(CX), Y7
	VMOVDQU 864(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 896(CX), Y7
	VMOVDQU 928(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 5 to 3 outputs
	VMOVDQU (R10), Y11
	VMOVDQU 32(R10), Y13
	ADDQ $0x40, R10
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 960(CX), Y7
	VMOVDQU 992(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1024(CX), Y7
	VMOVDQU 1056(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1088(CX), Y7
	VMOVDQU 1120(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Load and process 64 bytes from input 6 to 3 outputs
	VMOVDQU (DX), Y11
	VMOVDQU 32(DX), Y13
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 1152(CX), Y7
	VMOVDQU 1184(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y0)
	XOR3WAY( $0x00, Y9, Y10, Y1)
	VMOVDQU 1216(CX), Y7
	VMOVDQU 1248(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y2)
	XOR3WAY( $0x00, Y9, Y10, Y3)
	VMOVDQU 1280(CX), Y7
	VMOVDQU 1312(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// Store 3 outputs
	VMOVDQU Y0, (R12)
	VMOVDQU Y1, 32(R12)
	ADDQ $0x40, R12
	VMOVDQU Y2, (R13)
	VMOVDQU Y3, 32(R13)
	ADDQ $0x40, R13
	VMOVDQU Y4, (R11)
	VMOVDQU Y5, 32(R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_7x3_64Xor_loop
	VZEROUPPER

mulAvxTwo_7x3_64Xor_end:
	RET
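// mulAvxTwo_7x4 below is a single-block variant: no _64 suffix, 32 bytes per
// iteration (SHRQ $0x05, one VMOVDQU per input), and one YMM accumulator per
// output instead of the two used by the 64-byte kernels.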
// func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 65 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_7x4_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), DX
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R11
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R11

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, DX
	MOVQ $0x0000000f, R15
	MOVQ R15, X4
	VPBROADCASTB X4, Y4

mulAvxTwo_7x4_loop:
	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y7
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU (CX), Y5
	VMOVDQU 32(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y0
	VMOVDQU 64(CX), Y5
	VMOVDQU 96(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y1
	VMOVDQU 128(CX), Y5
	VMOVDQU 160(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y2
	VMOVDQU 192(CX), Y5
	VMOVDQU 224(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y3

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y7
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 256(CX), Y5
	VMOVDQU 288(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	VMOVDQU 320(CX), Y5
	VMOVDQU 352(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y1)
	VMOVDQU 384(CX), Y5
	VMOVDQU 416(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU 448(CX), Y5
	VMOVDQU 480(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y3)

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y7
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 512(CX), Y5
	VMOVDQU 544(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	VMOVDQU 576(CX), Y5
	VMOVDQU 608(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y1)
	VMOVDQU 640(CX), Y5
	VMOVDQU 672(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU 704(CX), Y5
	VMOVDQU 736(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y3)

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y7
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 768(CX), Y5
	VMOVDQU 800(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	VMOVDQU 832(CX), Y5
	VMOVDQU 864(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y1)
	VMOVDQU 896(CX), Y5
	VMOVDQU 928(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU 960(CX), Y5
	VMOVDQU 992(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y3)

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (R9), Y7
	ADDQ $0x20, R9
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 1024(CX), Y5
	VMOVDQU 1056(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	VMOVDQU 1088(CX), Y5
	VMOVDQU 1120(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y1)
	VMOVDQU 1152(CX), Y5
	VMOVDQU 1184(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU 1216(CX), Y5
	VMOVDQU 1248(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y3)

	// Load and process 32 bytes from input 5 to 4 outputs
	VMOVDQU (R10), Y7
	ADDQ $0x20, R10
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 1280(CX), Y5
	VMOVDQU 1312(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	VMOVDQU 1344(CX), Y5
	VMOVDQU 1376(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y1)
	VMOVDQU 1408(CX), Y5
	VMOVDQU 1440(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU 1472(CX), Y5
	VMOVDQU 1504(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y3)

	// Load and process 32 bytes from input 6 to 4 outputs
	VMOVDQU (DX), Y7
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 1536(CX), Y5
	VMOVDQU 1568(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y0)
	VMOVDQU 1600(CX), Y5
	VMOVDQU 1632(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y1)
	VMOVDQU 1664(CX), Y5
	VMOVDQU 1696(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU 1728(CX), Y5
	VMOVDQU 1760(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y5, Y6, Y3)

	// Store 4 outputs
	VMOVDQU Y0, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y1, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y2, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y3, (R11)
	ADDQ $0x20, R11

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_7x4_loop
	VZEROUPPER

mulAvxTwo_7x4_end:
	RET
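// mulGFNI_7x4_64 below keeps 26 of its 28 bit-matrices in Z0-Z25; the last
// two are applied with the broadcast-from-memory form VGF2P8AFFINEQB.BCST,
// which folds the 8-byte matrix load into the multiply instruction.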
VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R11 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R11 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, DX mulGFNI_7x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 4 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 4 outputs VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ 
mulGFNI_7x4_64_loop VZEROUPPER mulGFNI_7x4_64_end: RET // func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x4(SB), $0-88 // Loading 10 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R11 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R11 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, DX mulAvxGFNI_7x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_7x4_loop VZEROUPPER mulAvxGFNI_7x4_end: RET // func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 // Loading 26 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x4_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R11 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R11 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, DX mulGFNI_7x4_64Xor_loop: // Load 4 outputs VMOVDQU64 (R12), Z26 VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (R11), Z29 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD 
Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 4 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 4 outputs VMOVDQU64 Z26, (R12) ADDQ $0x40, R12 VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_7x4_64Xor_loop VZEROUPPER mulGFNI_7x4_64Xor_end: RET // func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 // Loading 10 of 28 tables to registers // Destination kept in GP registers // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R11 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R11 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, DX mulAvxGFNI_7x4Xor_loop: // Load 4 outputs VMOVDQU (R12), Y10 VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (R11), Y13 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 
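	// Editorial note (not generator output): the Xor variants differ from the
	// plain kernels only in the loop prologue: they preload the current output
	// blocks ("Load 4 outputs" above) and accumulate into them instead of
	// overwriting, so callers can fold further shards into existing parity.
	// Hedged caller-side contrast (signatures as declared in the stubs):
	//
	//	mulAvxGFNI_7x4(matrix, in, out, start, n)    // out = matrix * in
	//	mulAvxGFNI_7x4Xor(matrix, in, out, start, n) // out ^= matrix * in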
VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R12) ADDQ $0x20, R12 VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_7x4Xor_loop VZEROUPPER mulAvxGFNI_7x4Xor_end: RET // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R11 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R11 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X4 VPBROADCASTB X4, Y4 mulAvxTwo_7x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (R12), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 
VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x4Xor_loop VZEROUPPER mulAvxTwo_7x4Xor_end: RET // func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 // Loading no 
tables to registers // Destination kept in GP registers // Full registers estimated 80 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 mulAvxTwo_7x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 
1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R15) ADDQ $0x20, R15 VMOVDQU Y4, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x5_loop VZEROUPPER mulAvxTwo_7x5_end: RET // func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64(SB), $8-88 // Loading 25 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output 
ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX mulGFNI_7x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_7x5_64_loop VZEROUPPER mulGFNI_7x5_64_end: RET // func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x5(SB), $8-88 // Loading 9 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 
32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX mulAvxGFNI_7x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 
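	// Editorial note (not generator output): this AVX kernel can keep only
	// 9 of the 35 matrix qwords resident (Y0-Y8, from a 16-register YMM
	// file); the remaining coefficients are re-broadcast from 72(CX) onward
	// with VBROADCASTSD inside the loop. The AVX-512 variant above keeps 25
	// tables resident in Z0-Z24 and folds the rest via VGF2P8AFFINEQB.BCST.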
VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_7x5_loop VZEROUPPER mulAvxGFNI_7x5_end: RET // func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 // Loading 25 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX mulGFNI_7x5_64Xor_loop: // Load 5 outputs VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R11), Z29 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, 
Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_7x5_64Xor_loop VZEROUPPER mulGFNI_7x5_64Xor_end: RET // func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 // Loading 9 of 35 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX mulAvxGFNI_7x5Xor_loop: // Load 5 outputs VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R11), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, 
Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_7x5Xor_loop VZEROUPPER mulAvxGFNI_7x5Xor_end: RET // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 80 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), 
R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 mulAvxTwo_7x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (R12), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 
XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R15) ADDQ $0x20, R15 VMOVDQU Y4, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x5Xor_loop VZEROUPPER mulAvxTwo_7x5Xor_end: RET // func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 95 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x6_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_7x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB 
Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, 
Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R15) ADDQ $0x20, R15 VMOVDQU Y5, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ BP JNZ mulAvxTwo_7x6_loop VZEROUPPER mulAvxTwo_7x6_end: RET // func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64(SB), $8-88 // Loading 24 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_7x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // 
Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ BP JNZ mulGFNI_7x6_64_loop VZEROUPPER mulGFNI_7x6_64_end: RET // func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x6(SB), $8-88 // Loading 8 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 
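	// Only the first 8 of the 42 tables stay register-resident (Y0-Y7);
	// the remaining 34 are re-broadcast from the matrix inside the loop.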
VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_7x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 
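	// Y15 doubles as table and result: each VBROADCASTSD reloads it, and
	// the following VGF2P8AFFINEQB overwrites it with the product.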
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_7x6_loop VZEROUPPER mulAvxGFNI_7x6_end: RET // func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 // Loading 24 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_7x6_64Xor_loop: // Load 6 outputs VMOVDQU64 (R11), Z24 VMOVDQU64 (R12), Z25 VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R10), Z29 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX 
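	// Inputs 1-3 use the pre-broadcast tables Z6-Z23; inputs 4-6 below
	// fall back to VGF2P8AFFINEQB.BCST, folding the 8-byte table
	// broadcast into the instruction's memory operand.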
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs VMOVDQU64 Z24, (R11) ADDQ $0x40, R11 VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R10) ADDQ $0x40, R10 // Prepare for next loop DECQ BP JNZ mulGFNI_7x6_64Xor_loop VZEROUPPER mulGFNI_7x6_64Xor_end: RET // func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 // Loading 8 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ 
in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_7x6Xor_loop: // Load 6 outputs VMOVDQU (R11), Y8 VMOVDQU (R12), Y9 VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R10), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 240(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs VMOVDQU Y8, (R11) ADDQ $0x20, R11 VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_7x6Xor_loop VZEROUPPER mulAvxGFNI_7x6Xor_end: RET // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 95 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x6Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_7x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (R11), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 
480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, 
Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R15) ADDQ $0x20, R15 VMOVDQU Y5, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ BP JNZ mulAvxTwo_7x6Xor_loop VZEROUPPER mulAvxTwo_7x6Xor_end: RET // func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 mulAvxTwo_7x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 
800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 
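	// Each 64-byte table pair (here 2496/2528) holds the low- and
	// high-nibble lookup rows for one output; both halves are shuffled
	// and folded into the accumulator with XOR3WAY.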
VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x7_loop VZEROUPPER mulAvxTwo_7x7_end: RET // func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64(SB), $0-88 // Loading 23 of 49 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 
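	// Z31 is the lone scratch for every product; each result is folded
	// into its Z23-Z29 accumulator with VXORPD before the next affine.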
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x7_64_loop VZEROUPPER mulGFNI_7x7_64_end: RET // func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x7(SB), $0-88 
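	// Roughly, per 32-byte lane, this computes the following (a sketch;
	// gfAffine is a hypothetical helper applying the 8x8 GF(2) bit
	// matrix m to each byte of v, which is what VGF2P8AFFINEQB does):
	//
	//	for r := 0; r < 7; r++ {
	//		var acc [32]byte
	//		for c := 0; c < 7; c++ {
	//			p := gfAffine(matrix[c*7+r], in[c][off:off+32])
	//			for i := range acc {
	//				acc[i] ^= p[i] // accumulate GF(2^8) products
	//			}
	//		}
	//		copy(out[r][off:off+32], acc[:])
	//	}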
// Loading 7 of 49 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x7_loop VZEROUPPER mulAvxGFNI_7x7_end: RET // func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 // Loading 23 of 49 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x7_64Xor_loop: // Load 7 outputs MOVQ (R11), R13 VMOVDQU64 (R13)(R12*1), Z23 MOVQ 24(R11), R13 VMOVDQU64 (R13)(R12*1), Z24 MOVQ 48(R11), R13 VMOVDQU64 (R13)(R12*1), Z25 MOVQ 72(R11), R13 VMOVDQU64 (R13)(R12*1), Z26 MOVQ 96(R11), R13 VMOVDQU64 (R13)(R12*1), Z27 MOVQ 120(R11), R13 VMOVDQU64 
(R13)(R12*1), Z28 MOVQ 144(R11), R13 VMOVDQU64 (R13)(R12*1), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), 
Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x7_64Xor_loop VZEROUPPER mulGFNI_7x7_64Xor_end: RET // func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 // Loading 7 of 49 tables to registers // Destination kept on stack // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x7Xor_loop: // Load 7 outputs MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y7 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y8 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y9 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y10 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y11 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y12 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 
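	// In the Xor variant the accumulators are seeded from the existing
	// output bytes, so input 0 XORs in like every later input instead of
	// initializing the registers.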
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x7Xor_loop VZEROUPPER mulAvxGFNI_7x7Xor_end: RET // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x7Xor_end MOVQ in_base+24(FP), DX MOVQ 
(DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 mulAvxTwo_7x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 
1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, 
(R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x7Xor_loop VZEROUPPER mulAvxTwo_7x7Xor_end: RET // func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 125 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 mulAvxTwo_7x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 
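// Note on the lookup technique used throughout these mulAvxTwo kernels: each
// matrix coefficient is expanded into two 32-byte tables, one indexed by the
// low nibble of every input byte and one by the high nibble. VPSRLQ/VPAND
// split the bytes, the VPSHUFB pair performs 32 parallel lookups, and
// XOR3WAY folds both halves into the output accumulator. Rough scalar sketch
// (assuming the package's mulTableLow/mulTableHigh layout):
//   out[i] ^= mulTableLow[c][in[i]&15] ^ mulTableHigh[c][in[i]>>4]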
VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB 
Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x8_loop VZEROUPPER mulAvxTwo_7x8_end: RET // func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64(SB), $0-88 // Loading 22 of 56 tables to registers // Destination kept on stack // Full registers estimated 66 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 
outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs 
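// The output slice headers cannot all be pinned in registers ("Destination
// kept on stack"), so each store below re-reads a base pointer from the
// header array at R11 and writes at the running offset in R12. The 24-byte
// stride between the MOVQ loads is the size of a Go slice header on amd64.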
MOVQ (R11), R13 VMOVDQU64 Z22, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x8_64_loop VZEROUPPER mulGFNI_7x8_64_end: RET // func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x8(SB), $0-88 // Loading 6 of 56 tables to registers // Destination kept on stack // Full registers estimated 66 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 
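// Accumulation pattern in the non-Xor GFNI kernels: input 0 writes the
// Y6-Y13 accumulators directly, while every later input computes into the
// Y15 scratch register and VXORPDs the product in, summing over GF(2^8).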
VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x8_loop VZEROUPPER mulAvxGFNI_7x8_end: RET // func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 // Loading 22 of 56 tables to registers // Destination kept on stack // Full registers estimated 66 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 
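// VBROADCASTF32X2 replicates 64 bits (nominally a float32 pair) to every
// qword lane; it is simply a cheap way to splat one 8-byte GFNI affine
// matrix across a ZMM register ahead of the loop.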
VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x8_64Xor_loop: // Load 8 outputs MOVQ (R11), R13 VMOVDQU64 (R13)(R12*1), Z22 MOVQ 24(R11), R13 VMOVDQU64 (R13)(R12*1), Z23 MOVQ 48(R11), R13 VMOVDQU64 (R13)(R12*1), Z24 MOVQ 72(R11), R13 VMOVDQU64 (R13)(R12*1), Z25 MOVQ 96(R11), R13 VMOVDQU64 (R13)(R12*1), Z26 MOVQ 120(R11), R13 VMOVDQU64 (R13)(R12*1), Z27 MOVQ 144(R11), R13 VMOVDQU64 (R13)(R12*1), Z28 MOVQ 168(R11), R13 VMOVDQU64 (R13)(R12*1), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD 
Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R11), R13 VMOVDQU64 Z22, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x8_64Xor_loop VZEROUPPER mulGFNI_7x8_64Xor_end: RET // func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 // Loading 6 of 56 tables to registers // Destination kept on stack // Full registers estimated 66 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x8Xor_loop: // Load 8 outputs MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y6 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y7 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y8 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y9 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y10 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y11 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y12 MOVQ 
168(R11), R13 VMOVDQU (R13)(R12*1), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x8Xor_loop VZEROUPPER mulAvxGFNI_7x8Xor_end: RET // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 125 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 mulAvxTwo_7x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R11), R13 
VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 
1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) 
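// Unlike mulAvxTwo_7x8, this Xor variant seeded Y0-Y7 from the existing
// output bytes at the top of the loop, so the stores in this block write
// back sums over the old contents instead of fresh products.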
MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x8Xor_loop VZEROUPPER mulAvxTwo_7x8Xor_end: RET // func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 140 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X9 VPBROADCASTB X9, Y9 mulAvxTwo_7x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), 
Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, 
Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y8, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x9_loop VZEROUPPER mulAvxTwo_7x9_end: RET // func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64(SB), $0-88 // Loading 21 of 63 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 
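// Register budget: 21 of the 63 tables stay resident because 21 table
// registers plus 9 accumulators (Z21-Z29), the input block (Z30) and the
// scratch register (Z31) exactly fill the 32 ZMM registers. The remaining
// 42 matrices are fetched with embedded broadcasts inside the loop.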
VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 
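// The .BCST suffix is the embedded-broadcast form of VGF2P8AFFINEQB: the
// 8-byte matrix operand is read from memory and replicated across all lanes
// by the instruction itself, so tables that did not fit in registers cost
// no extra vector register to use.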
VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R11), R13 VMOVDQU64 Z21, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z22, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x9_64_loop VZEROUPPER mulGFNI_7x9_64_end: RET // func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x9(SB), $0-88 // Loading 5 of 63 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 
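	// mulAvxGFNI_7x9 is the 32-byte variant for CPUs with GFNI and AVX but
	// no usable AVX512: the loop count is n>>5 rather than n>>6, and with
	// only sixteen YMM registers just 5 of the 63 coefficient matrices stay
	// resident (Y0-Y4); every other step re-broadcasts its matrix with
	// VBROADCASTSD before the affine multiply, as in the sequence above.
	// (The doubled MOVQ out_base+48(FP), R11 in the prologue is a quirk of
	// the generator; the second load is redundant but harmless.)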
VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 
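	// Destination kept on stack: with the YMM file consumed by accumulators
	// and tables, the nine output pointers cannot be pinned in GP registers.
	// Each store therefore reloads a slice base with MOVQ 24*k(R11), R13
	// (24 bytes per Go slice header) and writes through (R13)(R12*1), where
	// R12 is the running start offset, advanced by the block size per loop.
	// Rough per-lane Go equivalent, as a sketch:
	//
	//	out[k][start+off] = acc[k] // k-th header at byte 24*k of out_base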
VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x9_loop VZEROUPPER mulAvxGFNI_7x9_end: RET // func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 // Loading 21 of 63 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x9_64Xor_loop: // Load 9 outputs MOVQ (R11), R13 VMOVDQU64 (R13)(R12*1), Z21 MOVQ 24(R11), R13 VMOVDQU64 (R13)(R12*1), Z22 MOVQ 48(R11), R13 VMOVDQU64 (R13)(R12*1), Z23 MOVQ 72(R11), R13 VMOVDQU64 (R13)(R12*1), Z24 MOVQ 96(R11), R13 VMOVDQU64 (R13)(R12*1), Z25 MOVQ 120(R11), R13 VMOVDQU64 (R13)(R12*1), Z26 MOVQ 144(R11), R13 VMOVDQU64 (R13)(R12*1), Z27 MOVQ 168(R11), R13 VMOVDQU64 (R13)(R12*1), Z28 MOVQ 192(R11), R13 VMOVDQU64 (R13)(R12*1), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD 
Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, 
Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R11), R13 VMOVDQU64 Z21, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z22, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x9_64Xor_loop VZEROUPPER mulGFNI_7x9_64Xor_end: RET // func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 // Loading 5 of 63 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x9Xor_loop: // Load 9 outputs MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y5 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y6 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y7 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y8 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y9 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y10 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y11 MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y12 MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 
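	// The Xor variants differ from the plain kernels only in initialization:
	// the nine outputs are loaded from memory first and every product is
	// XORed into them, instead of input 0 seeding the accumulators. That
	// lets a caller split the coding matrix and accumulate partial parity
	// over several calls; a sketch of such a hypothetical driver:
	//
	//	mulAvxGFNI_7x9(m1, in1, out, start, n)    // first group seeds out
	//	mulAvxGFNI_7x9Xor(m2, in2, out, start, n) // later groups accumulate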
VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 
392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x9Xor_loop VZEROUPPER mulAvxGFNI_7x9Xor_end: RET // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 140 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X9 VPBROADCASTB X9, Y9 mulAvxTwo_7x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, 
Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, 
Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 
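	// Y10 above and Y11 below form the low/high-nibble lookup pair for one
	// coefficient: AVX2 tables for (input i, output j) start at byte offset
	// 64*(i*9 + j), so input 6, output 3 lands at 64*57 = 3648(CX). Each
	// product is two VPSHUFB lookups on the split nibbles, folded into the
	// accumulator by the XOR3WAY macro.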
VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y8, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x9Xor_loop VZEROUPPER mulAvxTwo_7x9Xor_end: RET // func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 155 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X10 VPBROADCASTB X10, Y10 mulAvxTwo_7x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 
768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, 
Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, 
Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU Y9, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x10_loop VZEROUPPER mulAvxTwo_7x10_end: RET // func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64(SB), $0-88 // Loading 20 of 70 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), 
Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, 
Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R11), R13 VMOVDQU64 Z20, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z21, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z22, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x10_64_loop VZEROUPPER mulGFNI_7x10_64_end: RET // func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x10(SB), $0-88 // Loading 4 of 70 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 
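	// The 7x10 kernels stretch the register budget further: ten YMM
	// accumulators (Y4-Y13) plus the input (Y14) and scratch (Y15) leave
	// room to pin only 4 of the 70 coefficient matrices (Y0-Y3); every
	// remaining step re-broadcasts its 8-byte matrix from (CX), as here.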
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x10_loop VZEROUPPER mulAvxGFNI_7x10_end: RET // func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 // Loading 20 of 70 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_7x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulGFNI_7x10_64Xor_loop: // Load 10 outputs MOVQ (R11), R13 VMOVDQU64 (R13)(R12*1), Z20 MOVQ 24(R11), R13 VMOVDQU64 (R13)(R12*1), Z21 MOVQ 48(R11), R13 VMOVDQU64 (R13)(R12*1), Z22 MOVQ 72(R11), R13 VMOVDQU64 (R13)(R12*1), Z23 MOVQ 96(R11), R13 VMOVDQU64 (R13)(R12*1), Z24 MOVQ 120(R11), R13 VMOVDQU64 (R13)(R12*1), Z25 MOVQ 144(R11), R13 VMOVDQU64 (R13)(R12*1), Z26 MOVQ 168(R11), R13 VMOVDQU64 (R13)(R12*1), Z27 MOVQ 192(R11), R13 VMOVDQU64 (R13)(R12*1), Z28 MOVQ 216(R11), R13 VMOVDQU64 (R13)(R12*1), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, 
Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 
(DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R11), R13 VMOVDQU64 Z20, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU64 Z21, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU64 Z22, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU64 Z23, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU64 Z24, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU64 Z25, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU64 Z26, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU64 Z27, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU64 Z28, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU64 Z29, (R13)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ AX JNZ mulGFNI_7x10_64Xor_loop VZEROUPPER mulGFNI_7x10_64Xor_end: RET // func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 // Loading 4 of 70 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_7x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX mulAvxGFNI_7x10Xor_loop: // Load 10 outputs MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y4 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y5 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y6 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y7 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y8 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y9 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y10 MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y11 MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y12 MOVQ 216(R11), R13 VMOVDQU (R13)(R12*1), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y9, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y10, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y11, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y12, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU Y13, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxGFNI_7x10Xor_loop VZEROUPPER mulAvxGFNI_7x10Xor_end: RET // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 155 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X10 VPBROADCASTB X10, Y10 mulAvxTwo_7x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R11), R13 VMOVDQU (R13)(R12*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 
160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R11), R13 VMOVDQU (R13)(R12*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, 
Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU Y9, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x10Xor_loop VZEROUPPER mulAvxTwo_7x10Xor_end: RET // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X2 VPBROADCASTB X2, Y2 mulAvxTwo_8x1_64_loop: // Load and process 64 bytes 
from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x1_64_loop VZEROUPPER mulAvxTwo_8x1_64_end: RET // func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX 
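	// Annotation (not generator output): n>>6 gives the number of 64-byte
	// blocks, since every iteration of the loop below consumes a full 64-byte
	// block from each input shard (the 32-byte AVX kernels shift by 5
	// instead). Each VBROADCASTF32X2 that follows replicates one 8-byte GFNI
	// bit-matrix from the matrix slice across all eight 64-bit lanes of a
	// ZMM register.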
TESTQ AX, AX JZ mulGFNI_8x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, CX mulGFNI_8x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z9 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z9 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z9 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z9 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z9 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (R9), Z9 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU64 (R10), Z9 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU64 (CX), Z9 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 VXORPD Z8, Z9, Z8 // Store 1 outputs VMOVDQU64 Z8, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_8x1_64_loop VZEROUPPER mulGFNI_8x1_64_end: RET // func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, CX mulAvxGFNI_8x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 
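	// Annotation: VGF2P8AFFINEQB treats each broadcast table as an 8x8
	// bit-matrix and multiplies every input byte by it over GF(2); imm8 $0x00
	// adds no constant vector. The matrices are prepared so this transform
	// equals multiplication by one matrix coefficient in GF(2^8), and the
	// VXORPD that follows adds the product into the accumulator, since field
	// addition is XOR. Scalar sketch (illustrative name): out ^= gfMul(coeff, in).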
VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (CX), Y9 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 VXORPD Y8, Y9, Y8 // Store 1 outputs VMOVDQU Y8, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x1_loop VZEROUPPER mulAvxGFNI_8x1_end: RET // func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, CX mulGFNI_8x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (R11), Z8 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z9 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z9 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z9 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z9 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z9 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (R9), Z9 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU64 (R10), Z9 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU64 (CX), Z9 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 VXORPD Z8, Z9, Z8 // Store 1 outputs VMOVDQU64 Z8, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_8x1_64Xor_loop VZEROUPPER mulGFNI_8x1_64Xor_end: RET // func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), 
BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, CX mulAvxGFNI_8x1Xor_loop: // Load 1 outputs VMOVDQU (R11), Y8 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (CX), Y9 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 VXORPD Y8, Y9, Y8 // Store 1 outputs VMOVDQU Y8, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x1Xor_loop VZEROUPPER mulAvxGFNI_8x1Xor_end: RET // func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X2 VPBROADCASTB X2, Y2 mulAvxTwo_8x1_64Xor_loop: // Load 1 outputs VMOVDQU (R12), Y0 VMOVDQU 32(R12), Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 
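	// Annotation: classic AVX2 nibble technique. The 0x0f mask in Y2
	// (broadcast above) splits each byte into its low and high nibble, which
	// index two 16-entry lookup tables via VPSHUFB; XOR3WAY (macro defined at
	// the top of this file) folds both partial products into the
	// accumulators. In scalar terms, per input byte b:
	//   out ^= lowTbl[b&15] ^ highTbl[b>>4]   // sketch, illustrative names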
VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x1_64Xor_loop VZEROUPPER mulAvxTwo_8x1_64Xor_end: RET // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 73 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X4 VPBROADCASTB X4, Y4 mulAvxTwo_8x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 
VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 
VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) ADDQ $0x40, R13 VMOVDQU Y2, (R12) VMOVDQU Y3, 32(R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x2_64_loop VZEROUPPER mulAvxTwo_8x2_64_end: RET // func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R11 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 ADDQ R13, R11 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, CX mulGFNI_8x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z18 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z18, Z16 VGF2P8AFFINEQB $0x00, Z1, Z18, Z17 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z18 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z18 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z18 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z18 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (R9), Z18 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU64 (R10), Z18 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 VXORPD Z16, Z19, Z16 
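	// Annotation: each input block updates both outputs in turn. One
	// VGF2P8AFFINEQB per output writes the partial product to the scratch
	// register Z19, and VXORPD folds it into the matching accumulator (Z16 or
	// Z17) before the next input shard is loaded.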
VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU64 (CX), Z18 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 VXORPD Z17, Z19, Z17 // Store 2 outputs VMOVDQU64 Z16, (R12) ADDQ $0x40, R12 VMOVDQU64 Z17, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_8x2_64_loop VZEROUPPER mulGFNI_8x2_64_end: RET // func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x2(SB), $0-88 // Loading 12 of 16 tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, DX mulAvxGFNI_8x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x2_loop VZEROUPPER mulAvxGFNI_8x2_end: RET // func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT 
·mulGFNI_8x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R11 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 ADDQ R13, R11 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, CX mulGFNI_8x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (R12), Z16 VMOVDQU64 (R11), Z17 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z18 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z18 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z18 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z18 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z18 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (R9), Z18 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU64 (R10), Z18 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 VXORPD Z17, Z19, Z17 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU64 (CX), Z18 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 VXORPD Z16, Z19, Z16 VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 VXORPD Z17, Z19, Z17 // Store 2 outputs VMOVDQU64 Z16, (R12) ADDQ $0x40, R12 VMOVDQU64 Z17, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_8x2_64Xor_loop VZEROUPPER mulGFNI_8x2_64Xor_end: RET // func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 // Loading 12 of 16 tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), 
Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, DX mulAvxGFNI_8x2Xor_loop: // Load 2 outputs VMOVDQU (R13), Y12 VMOVDQU (R12), Y13 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R13) ADDQ $0x20, R13 VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x2Xor_loop VZEROUPPER mulAvxGFNI_8x2Xor_end: RET // func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 73 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X4 VPBROADCASTB X4, Y4 mulAvxTwo_8x2_64Xor_loop: // Load 2 outputs VMOVDQU (R13), Y0 VMOVDQU 32(R13), Y1 VMOVDQU (R12), 
Y2 VMOVDQU 32(R12), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 
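// The 64-byte block is fetched as two 32-byte halves (Y9 and Y11). The VPSRLQ/VPAND sequence below splits every byte into its low and high nibble; each nibble indexes a 16-entry VPSHUFB table, and the two lookups XOR together to give the GF(2^8) product of that byte with the matrix coefficient, accumulated into Y0-Y3.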
VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) ADDQ $0x40, R13 VMOVDQU Y2, (R12) VMOVDQU Y3, 32(R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x2_64Xor_loop VZEROUPPER mulAvxTwo_8x2_64Xor_end: RET // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 106 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 mulAvxTwo_8x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 
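// VMOVDQU 320(CX) above and 352(CX) below fetch the low/high-nibble table pair for input 1, output 2: pairs are stored input-major at 64 bytes apiece, so the offset is (input*3+output)*64 = 5*64 = 320.
// A scalar sketch of one lookup step (table names are illustrative, not this package's actual identifiers):
//
//	lo := tblLow[coef][in&0x0f] // low-nibble lookup
//	hi := tblHigh[coef][in>>4]  // high-nibble lookup
//	out ^= lo ^ hi              // GF(2^8) multiply-accumulate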
VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, 
Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) ADDQ $0x40, R13 VMOVDQU Y2, (R14) VMOVDQU Y3, 32(R14) ADDQ $0x40, R14 VMOVDQU Y4, (R12) VMOVDQU Y5, 32(R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x3_64_loop VZEROUPPER mulAvxTwo_8x3_64_end: RET // func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R11 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, CX mulGFNI_8x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z27 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z27, Z24 VGF2P8AFFINEQB $0x00, Z1, Z27, Z25 VGF2P8AFFINEQB $0x00, Z2, Z27, Z26 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z27 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), 
Z27 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z27 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z27 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (R9), Z27 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU64 (R10), Z27 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU64 (CX), Z27 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 VXORPD Z26, Z28, Z26 // Store 3 outputs VMOVDQU64 Z24, (R12) ADDQ $0x40, R12 VMOVDQU64 Z25, (R13) ADDQ $0x40, R13 VMOVDQU64 Z26, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_8x3_64_loop VZEROUPPER mulGFNI_8x3_64_end: RET // func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x3(SB), $0-88 // Loading 11 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, DX mulAvxGFNI_8x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and 
process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x3_loop VZEROUPPER mulAvxGFNI_8x3_end: RET // func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), CX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R11 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, CX mulGFNI_8x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (R12), Z24 VMOVDQU64 (R13), Z25 VMOVDQU64 (R11), Z26 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z27 ADDQ 
$0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z27 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z27 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z27 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z27 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (R9), Z27 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU64 (R10), Z27 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 VXORPD Z26, Z28, Z26 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU64 (CX), Z27 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 VXORPD Z24, Z28, Z24 VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 VXORPD Z25, Z28, Z25 VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 VXORPD Z26, Z28, Z26 // Store 3 outputs VMOVDQU64 Z24, (R12) ADDQ $0x40, R12 VMOVDQU64 Z25, (R13) ADDQ $0x40, R13 VMOVDQU64 Z26, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ AX JNZ mulGFNI_8x3_64Xor_loop VZEROUPPER mulGFNI_8x3_64Xor_end: RET // func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 // Loading 11 of 24 tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, DX mulAvxGFNI_8x3Xor_loop: // Load 3 outputs VMOVDQU (R13), Y11 VMOVDQU (R14), Y12 VMOVDQU (R12), Y13 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, 
Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R13) ADDQ $0x20, R13 VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x3Xor_loop VZEROUPPER mulAvxGFNI_8x3Xor_end: RET // func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 106 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 mulAvxTwo_8x3_64Xor_loop: // Load 3 outputs VMOVDQU (R13), Y0 VMOVDQU 32(R13), Y1 VMOVDQU (R14), Y2 VMOVDQU 32(R14), 
Y3 VMOVDQU (R12), Y4 VMOVDQU 32(R12), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, 
Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) ADDQ $0x40, R13 VMOVDQU Y2, (R14) VMOVDQU Y3, 32(R14) ADDQ $0x40, R14 VMOVDQU Y4, (R12) VMOVDQU Y5, 32(R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x3_64Xor_loop VZEROUPPER mulAvxTwo_8x3_64Xor_end: RET // func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 73 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add 
start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_8x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 
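// 1472(CX) above and 1504(CX) below hold the final table pair for input 5 of this 8x4 kernel: pair index 5*4+3 = 23 at 64 bytes per pair (23*64 = 1472); input 6 continues at 1536(CX).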
VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R14) ADDQ $0x20, R14 VMOVDQU Y2, (R15) ADDQ $0x20, R15 VMOVDQU Y3, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x4_loop VZEROUPPER mulAvxTwo_8x4_end: RET // func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64(SB), $8-88 // Loading 26 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX mulGFNI_8x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 
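// Each VGF2P8AFFINEQB multiplies every byte of Z30 by one matrix coefficient, encoded as an 8x8 GF(2) bit matrix broadcast into a Z table register (bit i of the result is the parity of row i ANDed with the input byte; imm8 is 0, so nothing extra is XORed in). VXORPD then folds the product into the output accumulator, since addition in GF(2^8) is plain XOR.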
VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 4 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 4 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 4 outputs VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_8x4_64_loop VZEROUPPER mulGFNI_8x4_64_end: RET // func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x4(SB), $8-88 // Loading 10 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX mulAvxGFNI_8x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB 
$0x00, Y0, Y14, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x4_loop VZEROUPPER mulAvxGFNI_8x4_end: RET // func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 // Loading 26 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x4_64Xor_end VBROADCASTF32X2 
(CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX mulGFNI_8x4_64Xor_loop: // Load 4 outputs VMOVDQU64 (R13), Z26 VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R12), Z29 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 4 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), 
Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 4 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 4 outputs VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_8x4_64Xor_loop VZEROUPPER mulGFNI_8x4_64Xor_end: RET // func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 // Loading 10 of 32 tables to registers // Destination kept in GP registers // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX mulAvxGFNI_8x4Xor_loop: // Load 4 outputs VMOVDQU (R13), Y10 VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R12), Y13 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_8x4Xor_loop VZEROUPPER mulAvxGFNI_8x4Xor_end: RET // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 73 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_8x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (R13), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB 
Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 
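	// XOR3WAY xors the low- and high-nibble lookups (Y5, Y6) into the
	// output accumulator in one step, so each table pair costs two
	// shuffles plus a single combine.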
XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R14) ADDQ $0x20, R14 VMOVDQU Y2, (R15) ADDQ $0x20, R15 VMOVDQU Y3, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x4Xor_loop VZEROUPPER mulAvxTwo_8x4Xor_end: RET // func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x5_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_8x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 
XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R15) ADDQ $0x20, R15 VMOVDQU Y4, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ BP JNZ mulAvxTwo_8x5_loop VZEROUPPER mulAvxTwo_8x5_end: RET // func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) 
// Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64(SB), $8-88 // Loading 25 of 40 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_8x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 
208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 5 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 5 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs VMOVDQU64 Z25, (R12) ADDQ $0x40, R12 VMOVDQU64 Z26, (R13) ADDQ $0x40, R13 VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R11) ADDQ $0x40, R11 // Prepare for next loop DECQ BP JNZ mulGFNI_8x5_64_loop VZEROUPPER mulGFNI_8x5_64_end: RET // func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x5(SB), $8-88 // Loading 9 of 40 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_8x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 
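	// Only 9 of this kernel's 40 tables stay resident in Y0-Y8; the
	// remaining matrix rows are re-broadcast from memory just before use.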
VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_8x5_loop VZEROUPPER mulAvxGFNI_8x5_end: RET // func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 // Loading 25 of 40 tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	VBROADCASTF32X2 192(CX), Z24
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), AX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x06, BP

mulGFNI_8x5_64Xor_loop:
	// Load 5 outputs
	VMOVDQU64 (R12), Z25
	VMOVDQU64 (R13), Z26
	VMOVDQU64 (R14), Z27
	VMOVDQU64 (R15), Z28
	VMOVDQU64 (R11), Z29

	// Load and process 64 bytes from input 0 to 5 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 5 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 5 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 5 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 5 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 5 to 5 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 5 outputs
	VMOVDQU64 (R10), Z30
	ADDQ $0x40, R10
	VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 5 outputs
	VMOVDQU64 (AX), Z30
	ADDQ $0x40, AX
	VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 5 outputs
	VMOVDQU64 Z25, (R12)
	ADDQ $0x40, R12
	VMOVDQU64 Z26, (R13)
	ADDQ $0x40, R13
	VMOVDQU64 Z27, (R14)
	ADDQ $0x40, R14
	VMOVDQU64 Z28, (R15)
	ADDQ $0x40, R15
	VMOVDQU64 Z29, (R11)
	ADDQ $0x40, R11

	// Prepare for next loop
	DECQ BP
	JNZ mulGFNI_8x5_64Xor_loop
	VZEROUPPER

mulGFNI_8x5_64Xor_end:
	RET

// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88
	// Loading 9 of 40 tables to registers
	// Destination kept in GP registers
	// Full registers estimated 47 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_8x5Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), R8
	MOVQ 120(AX), R9
	MOVQ 144(AX), R10
	MOVQ 168(AX), AX
	MOVQ out_base+48(FP), R11
	MOVQ out_base+48(FP), R11
	MOVQ (R11), R12
	MOVQ 24(R11), R13
	MOVQ 48(R11), R14
	MOVQ 72(R11), R15
	MOVQ 96(R11), R11
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R11

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, AX

	// Reload length to save a register
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxGFNI_8x5Xor_loop:
	// Load 5 outputs
	VMOVDQU (R12), Y9
	VMOVDQU (R13), Y10
	VMOVDQU (R14), Y11
	VMOVDQU (R15), Y12
	VMOVDQU (R11), Y13

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
	VXORPD Y12, Y15, Y12
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
	VXORPD Y9, Y15, Y9
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y10, Y15, Y10
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y11, Y15, Y11
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 104(CX), Y15
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs VMOVDQU Y9, (R12) ADDQ $0x20, R12 VMOVDQU Y10, (R13) ADDQ $0x20, R13 VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_8x5Xor_loop VZEROUPPER mulAvxGFNI_8x5Xor_end: RET // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x5Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 
MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_8x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (R12), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 
1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R15) ADDQ $0x20, R15 VMOVDQU Y4, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ BP JNZ mulAvxTwo_8x5Xor_loop VZEROUPPER mulAvxTwo_8x5Xor_end: RET // func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 107 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 mulAvxTwo_8x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 
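	// Y9 and Y10 now hold the low and high nibbles of 32 input bytes;
	// each nibble indexes a 32-byte VPSHUFB table, and the two lookups
	// combine to one GF(2^8) product per byte.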
VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 
VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x6_loop VZEROUPPER mulAvxTwo_8x6_end: RET // func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64(SB), $0-88 // Loading 24 of 48 tables to registers // Destination kept on stack // Full registers estimated 56 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), 
Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 6 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 6 outputs
	MOVQ (R12), R14
	VMOVDQU64 Z24, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU64 Z25, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU64 Z26, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU64 Z27, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU64 Z28, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU64 Z29, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x40, R13
	DECQ AX
	JNZ mulGFNI_8x6_64_loop
	VZEROUPPER

mulGFNI_8x6_64_end:
	RET

// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_8x6(SB), $0-88
	// Loading 8 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_8x6_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulAvxGFNI_8x6_loop:
	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y8
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y9
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y10
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y11
	VGF2P8AFFINEQB $0x00, Y4, Y14, Y12
	VGF2P8AFFINEQB $0x00, Y5, Y14, Y13

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
	VXORPD Y8, Y15, Y8
	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 64(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 72(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 6 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 6 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 6 outputs
	VMOVDQU (DX), Y14
	ADDQ $0x20, DX
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Store 6 outputs
	MOVQ (R12), R14
	VMOVDQU Y8, (R14)(R13*1)
	MOVQ 24(R12), R14
	VMOVDQU Y9, (R14)(R13*1)
	MOVQ 48(R12), R14
	VMOVDQU Y10, (R14)(R13*1)
	MOVQ 72(R12), R14
	VMOVDQU Y11, (R14)(R13*1)
	MOVQ 96(R12), R14
	VMOVDQU Y12, (R14)(R13*1)
	MOVQ 120(R12), R14
	VMOVDQU Y13, (R14)(R13*1)

	// Prepare for next loop
	ADDQ $0x20, R13
	DECQ AX
	JNZ mulAvxGFNI_8x6_loop
	VZEROUPPER

mulAvxGFNI_8x6_end:
	RET

// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_8x6_64Xor(SB), $0-88
	// Loading 24 of 48 tables to registers
	// Destination kept on stack
	// Full registers estimated 56 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_8x6_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	VBROADCASTF32X2 184(CX), Z23
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), DX
	MOVQ out_base+48(FP), R12
	MOVQ out_base+48(FP), R12
	MOVQ start+72(FP), R13

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, R9
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, DX

mulGFNI_8x6_64Xor_loop:
	// Load 6 outputs
	MOVQ (R12), R14
	VMOVDQU64 (R14)(R13*1), Z24
	MOVQ 24(R12), R14
	VMOVDQU64 (R14)(R13*1), Z25
	MOVQ 48(R12), R14
	VMOVDQU64 (R14)(R13*1), Z26
	MOVQ 72(R12), R14
	VMOVDQU64 (R14)(R13*1), Z27
	MOVQ 96(R12), R14
	VMOVDQU64 (R14)(R13*1), Z28
	MOVQ 120(R12), R14
	VMOVDQU64 (R14)(R13*1), Z29

	// Load and process 64 bytes from input 0 to 6 outputs
	VMOVDQU64 (BX), Z30
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 1 to 6 outputs
	VMOVDQU64 (SI), Z30
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 6 outputs
	VMOVDQU64 (DI), Z30
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 6 outputs
	VMOVDQU64 (R8), Z30
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 6 outputs
	VMOVDQU64 (R9), Z30
	ADDQ $0x40, R9
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
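	// Matrix rows 24-47 did not fit in Z0-Z23; the .BCST form broadcasts
	// each 8-byte row straight from memory as part of the affine multiply.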
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs MOVQ (R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x6_64Xor_loop VZEROUPPER mulGFNI_8x6_64Xor_end: RET // func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 // Loading 8 of 48 tables to registers // Destination kept on stack // Full registers estimated 56 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x6Xor_loop: // Load 6 outputs MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y8 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y9 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y10 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y11 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y12 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 
VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 
7 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x6Xor_loop VZEROUPPER mulAvxGFNI_8x6Xor_end: RET // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 107 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 mulAvxTwo_8x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) 
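// NOTE (editorial, hedged): the mulAvxTwo kernels implement GF(2^8)
// multiplication by 4-bit table lookup rather than GFNI. VPSRLQ and VPAND
// split every input byte into its low and high nibble, VPSHUFB looks each
// nibble up in a 32-byte table (the same 16-byte pattern in both 128-bit
// lanes), and the XOR3WAY macro defined at the top of this file folds both
// halves into the output accumulator. Each (input, output) coefficient
// therefore owns 64 bytes of the matrix slice, which is why the VMOVDQU
// table offsets advance in steps of 64 per output.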
// Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) 
VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x6Xor_loop VZEROUPPER mulAvxTwo_8x6Xor_end: RET // func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 124 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 mulAvxTwo_8x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB 
Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB 
Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x7_loop VZEROUPPER mulAvxTwo_8x7_end: RET // func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64(SB), $0-88 // Loading 23 of 56 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 
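// NOTE (editorial, hedged): 8 inputs times 7 outputs needs 56 coefficient
// matrices, but only 23 fit in registers, matching the "Loading 23 of 56
// tables" comment above: Z0-Z22 hold tables, Z23-Z29 the seven
// accumulators, and Z30/Z31 the input and scratch, filling all 32 ZMM
// registers. VBROADCASTF32X2 replicates one 8-byte matrix across all eight
// 64-bit lanes of a ZMM register; the 33 matrices that do not fit are
// consumed later in the loop via the .BCST memory form.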
VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD 
Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x7_64_loop VZEROUPPER mulGFNI_8x7_64_end: RET // func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x7(SB), $0-88 // Loading 7 of 56 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD 
Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x7_loop VZEROUPPER mulAvxGFNI_8x7_end: RET // func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 // Loading 23 of 56 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x7_64Xor_loop: // Load 7 outputs MOVQ (R12), R14 VMOVDQU64 (R14)(R13*1), Z23 MOVQ 24(R12), R14 VMOVDQU64 (R14)(R13*1), Z24 MOVQ 48(R12), R14 VMOVDQU64 (R14)(R13*1), Z25 MOVQ 72(R12), R14 VMOVDQU64 (R14)(R13*1), Z26 MOVQ 96(R12), R14 VMOVDQU64 (R14)(R13*1), Z27 MOVQ 120(R12), R14 VMOVDQU64 (R14)(R13*1), Z28 MOVQ 144(R12), R14 VMOVDQU64 (R14)(R13*1), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 
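// NOTE (editorial, hedged): the Xor variants differ from the plain kernels
// only at the top of the loop, where the seven destinations are first
// loaded from (R14)(R13*1) and every affine product is XORed into them, so
// the call accumulates into existing output instead of overwriting it.
// With eight input pointers plus loop state there are not enough general
// purpose registers left for seven output pointers, hence "Destination
// kept on stack": each pointer is re-read from the out slice at R12 on
// every access.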
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 
96(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x7_64Xor_loop VZEROUPPER mulGFNI_8x7_64Xor_end: RET // func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 // Loading 7 of 56 tables to registers // Destination kept on stack // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x7Xor_loop: // Load 7 outputs MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y7 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y8 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y9 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y10 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y11 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y12 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x7Xor_loop VZEROUPPER mulAvxGFNI_8x7Xor_end: RET // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start 
int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 124 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 mulAvxTwo_8x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 
1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 
2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x7Xor_loop VZEROUPPER mulAvxTwo_8x7Xor_end: RET // func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 141 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 mulAvxTwo_8x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 
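// NOTE (editorial, hedged): a scalar Go model of the nibble kernel, for
// orientation only; the table names below are illustrative and are not
// identifiers from this package. Each coefficient c owns a low-nibble and
// a high-nibble 16-entry table, and their XOR is the GF(2^8) product of c
// with the input byte:
//
//	low, high := in[i]&0x0f, in[i]>>4
//	out[i] ^= mulTableLow[c][low] ^ mulTableHigh[c][high]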
VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 
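// Note: XOR3WAY (defined at the top of this file) accumulates two partial
// products into the destination, i.e. dst ^= a ^ b. Under GOAMD64_v4 it is
// a single VPTERNLOGD $0x96 (imm 0x96 is the three-way-XOR truth table),
// which is why AVX512F/AVX512VL appear in the Requires list; otherwise it
// falls back to two VPXORs. XOR is addition in GF(2^8), so each input's
// contribution is summed into the running output vectors Y0-Y7.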
VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 
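// Note: for this 8x8 kernel the matrix buffer holds 64 bytes of tables
// (32 low-nibble + 32 high-nibble) per (input, output) pair, laid out so
// input i, output j starts at offset (i*8+j)*64 from CX; input 7 here
// therefore begins at 3584(CX).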
VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x8_loop VZEROUPPER mulAvxTwo_8x8_end: RET // func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64(SB), $0-88 // Loading 22 of 64 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ 
$0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R12), R14 VMOVDQU64 Z22, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x8_64_loop VZEROUPPER mulGFNI_8x8_64_end: RET // func mulAvxGFNI_8x8(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x8(SB), $0-88 // Loading 6 of 64 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, 
R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x8_loop VZEROUPPER mulAvxGFNI_8x8_end: RET // func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 // Loading 22 of 
64 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x8_64Xor_loop: // Load 8 outputs MOVQ (R12), R14 VMOVDQU64 (R14)(R13*1), Z22 MOVQ 24(R12), R14 VMOVDQU64 (R14)(R13*1), Z23 MOVQ 48(R12), R14 VMOVDQU64 (R14)(R13*1), Z24 MOVQ 72(R12), R14 VMOVDQU64 (R14)(R13*1), Z25 MOVQ 96(R12), R14 VMOVDQU64 (R14)(R13*1), Z26 MOVQ 120(R12), R14 VMOVDQU64 (R14)(R13*1), Z27 MOVQ 144(R12), R14 VMOVDQU64 (R14)(R13*1), Z28 MOVQ 168(R12), R14 VMOVDQU64 (R14)(R13*1), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, 
Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R12), R14 VMOVDQU64 Z22, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x8_64Xor_loop VZEROUPPER mulGFNI_8x8_64Xor_end: RET // func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88 // Loading 6 of 64 tables to registers // Destination kept on stack // Full registers estimated 74 YMM used MOVQ n+80(FP), AX MOVQ 
matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x8Xor_loop: // Load 8 outputs MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y6 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y7 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y8 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y9 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y10 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y11 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y12 MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 168(R12), R14 
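// Note: out is a [][]byte kept on the stack, so every store round-trips
// through the slice headers: MOVQ n(R12), R14 loads the (n/24)-th
// destination's data pointer (Go slice headers are 24 bytes apart) and the
// vector is written at the running byte offset held in R13.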
VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x8Xor_loop VZEROUPPER mulAvxGFNI_8x8Xor_end: RET // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 141 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 mulAvxTwo_8x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 
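// Note: unlike mulAvxTwo_8x8, which initializes each output register with
// a plain VPXOR of the first input's two lookups, this Xor variant first
// loads the existing output bytes from memory and XOR3WAYs every input on
// top, so callers can accumulate into destinations that already hold
// partial results.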
VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB 
Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x8Xor_loop VZEROUPPER mulAvxTwo_8x8Xor_end: RET // func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 158 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 
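// Note: this prologue unpacks in ([][]byte) by loading each 24-byte slice
// header's data pointer into its own register and adding the start offset
// to each; n is shifted right by 5 (SHRQ $0x05) so AX counts the 32-byte
// blocks handled per iteration. The Go-side stub emitted into
// galois_gen_amd64.go is assumed to be a bare //go:noescape declaration
// matching the signature in the // func comment above.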
MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 mulAvxTwo_8x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, 
Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) 
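// Note: with 9 outputs the per-input table block grows to 9*64 = 576
// bytes, so input i, output j sits at offset (i*9+j)*64 from CX; input 5
// above starts at 2880(CX) and input 7 below at 4032(CX).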
VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y8, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x9_loop VZEROUPPER mulAvxTwo_8x9_end: RET // func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64(SB), $0-88 // Loading 21 of 72 tables to registers // Destination kept on stack // Full registers estimated 83 YMM 
used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 
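// Note: the GFNI kernels replace the two-table shuffle with one
// VGF2P8AFFINEQB per (input, output) pair: multiplying by a GF(2^8)
// constant is linear over GF(2), so each 8-byte matrix entry encodes the
// multiply as an 8x8 bit-matrix applied to every source byte (imm8 = 0,
// i.e. no xor constant). Per the header comment only 21 of the 72 tables
// fit in Z0-Z20; the remainder use the .BCST form to broadcast each qword
// straight from memory.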
VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R12), R14 VMOVDQU64 Z21, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z22, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x9_64_loop VZEROUPPER mulGFNI_8x9_64_end: RET // func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x9(SB), $0-88 // Loading 5 of 72 tables to registers // Destination kept on stack // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 
16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 192(R12), R14 
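	// Destination kept on stack: each of the nine output pointers is
	// re-fetched from the slice-header array in R12, while R13 carries the
	// running byte offset shared by all shards.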
VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x9_loop VZEROUPPER mulAvxGFNI_8x9_end: RET // func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 // Loading 21 of 72 tables to registers // Destination kept on stack // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x9_64Xor_loop: // Load 9 outputs MOVQ (R12), R14 VMOVDQU64 (R14)(R13*1), Z21 MOVQ 24(R12), R14 VMOVDQU64 (R14)(R13*1), Z22 MOVQ 48(R12), R14 VMOVDQU64 (R14)(R13*1), Z23 MOVQ 72(R12), R14 VMOVDQU64 (R14)(R13*1), Z24 MOVQ 96(R12), R14 VMOVDQU64 (R14)(R13*1), Z25 MOVQ 120(R12), R14 VMOVDQU64 (R14)(R13*1), Z26 MOVQ 144(R12), R14 VMOVDQU64 (R14)(R13*1), Z27 MOVQ 168(R12), R14 VMOVDQU64 (R14)(R13*1), Z28 MOVQ 192(R12), R14 VMOVDQU64 (R14)(R13*1), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD 
Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 
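	// This Xor variant matches mulGFNI_8x9_64 except for its loop
	// prologue: the nine destination vectors are loaded into Z21-Z29
	// first, so the products computed here accumulate onto existing output
	// data instead of overwriting it.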
VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R12), R14 VMOVDQU64 Z21, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z22, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x9_64Xor_loop VZEROUPPER mulGFNI_8x9_64Xor_end: RET // func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88 // Loading 5 of 72 tables to registers // Destination kept on stack // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x9Xor_loop: // Load 9 outputs MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y5 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y6 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y7 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y8 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y9 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y10 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y11 MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y12 MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x9Xor_loop VZEROUPPER mulAvxGFNI_8x9Xor_end: RET // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 158 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 mulAvxTwo_8x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R12), R14 VMOVDQU 
(R14)(R13*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 
1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 
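	// Y10/Y11 hold the low- and high-nibble lookup tables for this
	// input/output pair; the VPSHUFB pair plus XOR3WAY performs one
	// GF(2^8) multiply-accumulate across 32 bytes.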
VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y8, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x9Xor_loop VZEROUPPER mulAvxTwo_8x9Xor_end: RET // func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 175 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X10 VPBROADCASTB X10, Y10 mulAvxTwo_8x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 
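	// Input 0 initializes the ten accumulators directly (VPXOR of the two
	// nibble lookups); the remaining inputs fold their products in with
	// XOR3WAY.
	//
	// A minimal Go-side sketch of a call, assuming the stub declared in
	// the companion galois_gen_amd64.go and a length already rounded down
	// to the 32-byte step this loop consumes:
	//
	//	n := len(in[0]) &^ 31 // whole 32-byte blocks only
	//	if n > 0 {
	//		mulAvxTwo_8x10(matrix, in, out, 0, n)
	//	}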
VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( 
$0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 
3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 216(R12), 
R14 VMOVDQU Y9, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x10_loop VZEROUPPER mulAvxTwo_8x10_end: RET // func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64(SB), $0-88 // Loading 20 of 80 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 
256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R12), R14 VMOVDQU64 
Z20, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z21, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z22, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 216(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x10_64_loop VZEROUPPER mulGFNI_8x10_64_end: RET // func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x10(SB), $0-88 // Loading 4 of 80 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 
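	// Only 4 of the 80 matrix qwords stay resident in YMM registers (the
	// rest of the register file holds the ten accumulators plus scratch),
	// so factors such as this one are re-broadcast with VBROADCASTSD on
	// every loop iteration.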
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 
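// With 8 inputs x 10 outputs there are 80 coefficients but only 16 YMM registers, 10 of which hold accumulators; hence "Loading 4 of 80 tables to registers" above. Coefficients beyond the first four, like these for input 6, are re-broadcast from the matrix on every loop iteration.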
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 216(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x10_loop VZEROUPPER mulAvxGFNI_8x10_end: RET // func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 // Loading 20 of 80 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_8x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulGFNI_8x10_64Xor_loop: // Load 10 outputs MOVQ (R12), R14 VMOVDQU64 (R14)(R13*1), Z20 MOVQ 24(R12), R14 VMOVDQU64 (R14)(R13*1), Z21 MOVQ 48(R12), R14 VMOVDQU64 (R14)(R13*1), Z22 MOVQ 72(R12), R14 VMOVDQU64 (R14)(R13*1), Z23 MOVQ 96(R12), R14 VMOVDQU64 (R14)(R13*1), Z24 MOVQ 120(R12), R14 VMOVDQU64 (R14)(R13*1), Z25 MOVQ 144(R12), R14 VMOVDQU64 (R14)(R13*1), Z26 MOVQ 168(R12), R14 VMOVDQU64 (R14)(R13*1), Z27 MOVQ 192(R12), R14 VMOVDQU64 (R14)(R13*1), Z28 MOVQ 216(R12), R14 
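// This Xor variant computes out ^= matrix * in: the accumulators Z20-Z29 are seeded from the existing output shards here instead of being overwritten. out is a [][]byte kept on the stack, so each shard's base pointer is fetched from its slice header (headers are 24 bytes apart on amd64) and indexed with the running byte offset in R13.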
VMOVDQU64 (R14)(R13*1), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 
392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R12), R14 VMOVDQU64 Z20, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU64 Z21, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU64 Z22, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU64 Z23, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU64 Z24, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU64 Z25, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU64 Z26, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU64 Z27, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU64 Z28, (R14)(R13*1) MOVQ 216(R12), R14 VMOVDQU64 Z29, (R14)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ AX JNZ mulGFNI_8x10_64Xor_loop VZEROUPPER mulGFNI_8x10_64Xor_end: RET // func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 // Loading 4 of 80 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_8x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ 
start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX mulAvxGFNI_8x10Xor_loop: // Load 10 outputs MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y4 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y5 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y6 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y7 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y8 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y9 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y10 MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y11 MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y12 MOVQ 216(R12), R14 VMOVDQU (R14)(R13*1), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
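// Since the outputs were preloaded at the top of the loop, this Xor variant has no special case for input 0: every affine product, including the first, is folded in with VXORPD rather than initializing the accumulators directly.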
VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
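// Y15 serves as the only scratch register in this kernel: it receives the broadcast coefficient and is immediately overwritten by the affine product before the XOR below, leaving Y0-Y3 (preloaded tables), Y4-Y13 (accumulators), and Y14 (input) to fill the remaining fifteen registers.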
VXORPD Y5, Y15, Y5 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y9, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y10, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y11, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y12, (R14)(R13*1) MOVQ 216(R12), R14 VMOVDQU Y13, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxGFNI_8x10Xor_loop VZEROUPPER mulAvxGFNI_8x10Xor_end: RET // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 175 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X10 VPBROADCASTB X10, Y10 mulAvxTwo_8x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R12), R14 VMOVDQU (R14)(R13*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 
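// The mulAvxTwo kernels use the classic nibble-table method rather than GFNI: the 64-byte table pair just loaded holds the lookup tables for the low and high nibble of every input byte, VPSHUFB below performs 32 byte lookups in parallel, and XOR3WAY folds both halves into the accumulator. Illustrative per-byte Go, using the table names from this package's pure-Go path:
//
//	out[i] ^= mulTableLow[c][in[i]&0xf] ^ mulTableHigh[c][in[i]>>4]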
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R12), R14 VMOVDQU (R14)(R13*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 
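// Table layout note: each (input, output) pair owns 64 bytes of tables (two 32-byte halves), stored row-major, so pair (i, j) of this 8x10 kernel starts at 64*(i*10+j) from the matrix base. The pair just loaded at 2176(CX)/2208(CX) is input 3, output 4: 64*(3*10+4) = 2176.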
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 216(R12), R14 VMOVDQU Y9, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x10Xor_loop VZEROUPPER mulAvxTwo_8x10Xor_end: RET // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 
192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R13 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X2 VPBROADCASTB X2, Y2 mulAvxTwo_9x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ 
$0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x1_64_loop VZEROUPPER mulAvxTwo_9x1_64_end: RET // func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x1_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, CX mulGFNI_9x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z10 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z10, Z9 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z10 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z10 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z10 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z10 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (R9), Z10 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU64 (R10), Z10 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU64 (R11), Z10 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU64 (CX), Z10 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 VXORPD Z9, Z10, Z9 // Store 1 outputs VMOVDQU64 Z9, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_9x1_64_loop VZEROUPPER mulGFNI_9x1_64_end: RET // func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x1(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x1_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), CX 
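// in is a [][]byte: the MOVQs below walk its slice-header array, loading each shard's data pointer at offset 24*i (ptr/len/cap, 8 bytes each, on amd64). With 9 inputs and a single output every shard pointer fits in a GP register, which is what "Destination kept in GP registers" refers to above.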
MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, CX mulAvxGFNI_9x1_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 VXORPD Y9, Y10, Y9 // Store 1 outputs VMOVDQU Y9, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_9x1_loop VZEROUPPER mulAvxGFNI_9x1_end: RET // func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x1_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, CX mulGFNI_9x1_64Xor_loop: // Load 1 outputs VMOVDQU64 (R12), Z9 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z10 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z10 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU64 (SI), Z10 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU64 (DI), Z10 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 VXORPD 
Z9, Z10, Z9 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU64 (R8), Z10 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU64 (R9), Z10 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU64 (R10), Z10 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU64 (R11), Z10 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 VXORPD Z9, Z10, Z9 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU64 (CX), Z10 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 VXORPD Z9, Z10, Z9 // Store 1 outputs VMOVDQU64 Z9, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_9x1_64Xor_loop VZEROUPPER mulGFNI_9x1_64Xor_end: RET // func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x1Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, CX mulAvxGFNI_9x1Xor_loop: // Load 1 outputs VMOVDQU (R12), Y9 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 VXORPD Y9, Y10, Y9 // Store 1 outputs VMOVDQU Y9, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_9x1Xor_loop VZEROUPPER mulAvxGFNI_9x1Xor_end: RET // func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT 
·mulAvxTwo_9x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R13 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X2 VPBROADCASTB X2, Y2 mulAvxTwo_9x1_64Xor_loop: // Load 1 outputs VMOVDQU (R13), Y0 VMOVDQU 32(R13), Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // 
Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x1_64Xor_loop VZEROUPPER mulAvxTwo_9x1_64Xor_end: RET // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X4 VPBROADCASTB X4, Y4 mulAvxTwo_9x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( 
$0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ 
$0x40, R14 VMOVDQU Y2, (R13) VMOVDQU Y3, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x2_64_loop VZEROUPPER mulAvxTwo_9x2_64_end: RET // func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, CX mulGFNI_9x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z20 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z20 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z20 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z20 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (R9), Z20 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU64 (R10), Z20 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU64 (R11), Z20 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 2 outputs VMOVDQU64 Z18, (R13) ADDQ $0x40, R13 VMOVDQU64 Z19, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_9x2_64_loop VZEROUPPER mulGFNI_9x2_64_end: RET // func mulAvxGFNI_9x2(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x2(SB), $0-88 // Loading 12 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, DX mulAvxGFNI_9x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_9x2_loop VZEROUPPER mulAvxGFNI_9x2_end: RET // func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ 
AX, AX JZ mulGFNI_9x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, CX mulGFNI_9x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (R13), Z18 VMOVDQU64 (R12), Z19 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z20 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z20 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z20 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z20 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (R9), Z20 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU64 (R10), Z20 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU64 (R11), Z20 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU64 (CX), Z20 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 VXORPD Z18, Z21, Z18 VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 VXORPD Z19, Z21, Z19 // Store 2 outputs VMOVDQU64 Z18, (R13) ADDQ $0x40, R13 VMOVDQU64 Z19, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_9x2_64Xor_loop VZEROUPPER mulGFNI_9x2_64Xor_end: RET // func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 // Loading 12 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 
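// The VBROADCASTSD sequence replicates one 8-byte GF(2^8) bit-matrix
// constant per coefficient across a YMM register, so a single
// VGF2P8AFFINEQB multiplies 32 input bytes by that coefficient at once.
// Per byte this is ordinary field multiplication; a hedged Go sketch
// (gfMul is an illustrative helper, not a name from this file):
//
//	for i := range in {
//		out[i] ^= gfMul(coef, in[i]) // the broadcast matrix encodes "* coef"
//	}
//
// Only 12 of the 18 constants stay resident in Y0-Y11 (Y12-Y15 are
// needed as scratch); the remainder are re-broadcast inside the loop.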
VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, DX mulAvxGFNI_9x2Xor_loop: // Load 2 outputs VMOVDQU (R14), Y12 VMOVDQU (R13), Y13 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R14) ADDQ $0x20, R14 VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_9x2Xor_loop VZEROUPPER mulAvxGFNI_9x2Xor_end: RET // func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 
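// This Xor kernel accumulates into the existing output rather than
// overwriting it: each loop iteration first loads the current
// destination blocks into Y0-Y3 and XORs every product on top, i.e.
// out[i] ^= gfMul(coef, in[i]) instead of out[i] = gfMul(coef, in[i]).
// For reference, a minimal gfMul sketch in Go ("Russian peasant" form,
// assuming the 0x11d field polynomial this package's tables are built
// from; the name is illustrative, not part of this file):
//
//	func gfMul(a, b byte) byte {
//		var p byte
//		for b != 0 {
//			if b&1 != 0 {
//				p ^= a
//			}
//			carry := a & 0x80
//			a <<= 1
//			if carry != 0 {
//				a ^= 0x1d // reduce modulo x^8+x^4+x^3+x^2+1
//			}
//			b >>= 1
//		}
//		return p
//	}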
MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X4 VPBROADCASTB X4, Y4 mulAvxTwo_9x2_64Xor_loop: // Load 2 outputs VMOVDQU (R14), Y0 VMOVDQU 32(R14), Y1 VMOVDQU (R13), Y2 VMOVDQU 32(R13), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 
640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ $0x40, R14 VMOVDQU Y2, (R13) VMOVDQU Y3, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x2_64Xor_loop VZEROUPPER mulAvxTwo_9x2_64Xor_end: RET // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 118 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_9x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 
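// These PSHUFB lookups implement the split-nibble multiply: VPSRLQ and
// VPAND above separated each source byte into 4-bit halves, each half
// indexes a 16-entry table for the coefficient (low-nibble table in Y7,
// high-nibble table in Y8), and XORing the two lookups yields the full
// GF(2^8) product. A hedged scalar sketch (low16/high16 are
// illustrative names for the two 16-byte tables, not identifiers from
// this file):
//
//	for i, b := range in {
//		out[i] = low16[b&0x0f] ^ high16[b>>4]
//	}
//
// Input 0 initializes the accumulators with plain VPXOR; every later
// input folds in through the XOR3WAY macro instead.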
VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), 
Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ $0x40, R14 VMOVDQU Y2, (R15) VMOVDQU Y3, 32(R15) ADDQ $0x40, R15 VMOVDQU Y4, (R13) VMOVDQU Y5, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x3_64_loop VZEROUPPER mulAvxTwo_9x3_64_end: RET // func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ 
n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 VBROADCASTF32X2 208(CX), Z26 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, DX ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, CX mulGFNI_9x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z27, Z31, Z27 
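// Shape of this GFNI loop: every iteration consumes 64 bytes from each
// of the nine inputs, issues one VGF2P8AFFINEQB per matrix cell, and
// folds the products into the Z27-Z29 accumulators with VXORPD; input 0
// wrote the accumulators directly, so no zeroing pass is needed. The
// same computation in rough Go form (gfAffineMul is an illustrative
// stand-in for the affine multiply, not a name from this file):
//
//	for i := 0; i < 9; i++ { // inputs, outer as above
//		for o := 0; o < 3; o++ { // outputs
//			p := gfAffineMul(m[o][i], in[i]) // one VGF2P8AFFINEQB
//			if i == 0 {
//				acc[o] = p
//			} else {
//				acc[o] ^= p // VXORPD
//			}
//		}
//	}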
VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 3 outputs VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_9x3_64_loop VZEROUPPER mulGFNI_9x3_64_end: RET // func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x3(SB), $8-88 // Loading 11 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, DX mulAvxGFNI_9x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y14 
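// Only 11 of the 27 coefficients fit in Y0-Y10 here, so from the last
// product of input 3 onward each coefficient is re-broadcast from the
// matrix with VBROADCASTSD immediately before use; Y15 serves both as
// the broadcast target and as product scratch. This trades one extra
// 8-byte load per coefficient per iteration for keeping the whole 9x3
// kernel within the sixteen YMM registers.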
ADDQ $0x20, R12 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_9x3_loop VZEROUPPER mulAvxGFNI_9x3_end: RET // func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 VBROADCASTF32X2 208(CX), Z26 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), CX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, DX ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, CX mulGFNI_9x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (R13), Z27 VMOVDQU64 (R14), Z28 VMOVDQU64 (R12), Z29 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU64 (CX), Z30 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 VXORPD Z29, Z31, Z29 // Store 3 outputs VMOVDQU64 Z27, (R13) ADDQ $0x40, R13 VMOVDQU64 Z28, (R14) ADDQ $0x40, R14 VMOVDQU64 Z29, (R12) ADDQ $0x40, R12 // Prepare for next loop DECQ AX JNZ mulGFNI_9x3_64Xor_loop VZEROUPPER mulGFNI_9x3_64Xor_end: RET // func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 // Loading 11 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, DX mulAvxGFNI_9x3Xor_loop: // Load 3 outputs VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R13), Y13 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB 
$0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_9x3Xor_loop VZEROUPPER mulAvxGFNI_9x3Xor_end: RET // func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 118 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_9x3_64Xor_loop: // Load 3 outputs VMOVDQU (R14), Y0 VMOVDQU 32(R14), Y1 VMOVDQU (R15), Y2 VMOVDQU 32(R15), Y3 VMOVDQU (R13), Y4 VMOVDQU 32(R13), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 
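// AVX2 offers no per-byte shift, so the high nibbles are extracted with
// a 64-bit VPSRLQ and then VPAND against the 0x0f mask in Y6, which
// strips the bits dragged across byte boundaries. Per byte (hedged
// sketch):
//
//	lo := b & 0x0f        // VPAND
//	hi := (b >> 4) & 0x0f // VPSRLQ $0x04 + VPAND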
VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( 
$0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ $0x40, R14 VMOVDQU Y2, (R15) VMOVDQU Y3, 32(R15) ADDQ $0x40, R15 VMOVDQU Y4, (R13) VMOVDQU Y5, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x3_64Xor_loop VZEROUPPER mulAvxTwo_9x3_64Xor_end: RET // func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT 
·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x4_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), AX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_9x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), 
Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 ADDQ $0x20, AX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2048(CX), Y5 VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R14) ADDQ $0x20, R14 VMOVDQU Y2, (R15) ADDQ $0x20, R15 VMOVDQU Y3, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ BP JNZ mulAvxTwo_9x4_loop VZEROUPPER mulAvxTwo_9x4_end: RET // func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x4_64(SB), $8-88 // Loading 26 of 36 tables to registers // Destination kept in GP registers // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), 
// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x4_64(SB), $8-88
    // Loading 26 of 36 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 42 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX
    TESTQ AX, AX
    JZ mulGFNI_9x4_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    VBROADCASTF32X2 160(CX), Z20
    VBROADCASTF32X2 168(CX), Z21
    VBROADCASTF32X2 176(CX), Z22
    VBROADCASTF32X2 184(CX), Z23
    VBROADCASTF32X2 192(CX), Z24
    VBROADCASTF32X2 200(CX), Z25
    MOVQ in_base+24(FP), AX
    MOVQ (AX), DX
    MOVQ 24(AX), BX
    MOVQ 48(AX), SI
    MOVQ 72(AX), DI
    MOVQ 96(AX), R8
    MOVQ 120(AX), R9
    MOVQ 144(AX), R10
    MOVQ 168(AX), R11
    MOVQ 192(AX), AX
    MOVQ out_base+48(FP), R12
    MOVQ (R12), R13
    MOVQ 24(R12), R14
    MOVQ 48(R12), R15
    MOVQ 72(R12), R12
    MOVQ start+72(FP), BP

    // Add start offset to output
    ADDQ BP, R13
    ADDQ BP, R14
    ADDQ BP, R15
    ADDQ BP, R12

    // Add start offset to input
    ADDQ BP, DX
    ADDQ BP, BX
    ADDQ BP, SI
    ADDQ BP, DI
    ADDQ BP, R8
    ADDQ BP, R9
    ADDQ BP, R10
    ADDQ BP, R11
    ADDQ BP, AX

    // Reload length to save a register
    MOVQ n+80(FP), BP
    SHRQ $0x06, BP

mulGFNI_9x4_64_loop:
    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64 (DX), Z30
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z30, Z26
    VGF2P8AFFINEQB $0x00, Z1, Z30, Z27
    VGF2P8AFFINEQB $0x00, Z2, Z30, Z28
    VGF2P8AFFINEQB $0x00, Z3, Z30, Z29

    // Load and process 64 bytes from input 1 to 4 outputs
    VMOVDQU64 (BX), Z30
    ADDQ $0x40, BX
    VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 2 to 4 outputs
    VMOVDQU64 (SI), Z30
    ADDQ $0x40, SI
    VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 3 to 4 outputs
    VMOVDQU64 (DI), Z30
    ADDQ $0x40, DI
    VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 4 to 4 outputs
    VMOVDQU64 (R8), Z30
    ADDQ $0x40, R8
    VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 5 to 4 outputs
    VMOVDQU64 (R9), Z30
    ADDQ $0x40, R9
    VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 6 to 4 outputs
    VMOVDQU64 (R10), Z30
    ADDQ $0x40, R10
    VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 7 to 4 outputs
    VMOVDQU64 (R11), Z30
    ADDQ $0x40, R11
    VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 8 to 4 outputs
    VMOVDQU64 (AX), Z30
    ADDQ $0x40, AX
    VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Store 4 outputs
    VMOVDQU64 Z26, (R13)
    ADDQ $0x40, R13
    VMOVDQU64 Z27, (R14)
    ADDQ $0x40, R14
    VMOVDQU64 Z28, (R15)
    ADDQ $0x40, R15
    VMOVDQU64 Z29, (R12)
    ADDQ $0x40, R12

    // Prepare for next loop
    DECQ BP
    JNZ  mulGFNI_9x4_64_loop
    VZEROUPPER

mulGFNI_9x4_64_end:
    RET
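// The mulGFNI kernels replace the table lookups with VGF2P8AFFINEQB:
// multiplication by a constant is linear over GF(2), so each coefficient is
// precomputed as an 8x8 bit matrix packed into one uint64 and broadcast to
// every qword lane. Because the affine form encodes any GF(2)-linear map, it
// expresses multiplication in this package's field representation even
// though GF2P8MULB itself is hard-wired to the AES polynomial. A hedged
// scalar model of the instruction with imm8 = 0, per the SDM definition
// (illustrative helper, uses math/bits):
//
//	func affineByte(a uint64, x byte) byte {
//	    var y byte
//	    for i := 0; i < 8; i++ {
//	        row := byte(a >> (8 * (7 - i))) // matrix byte 7-i drives result bit i
//	        y |= byte(bits.OnesCount8(row&x)&1) << i
//	    }
//	    return y
//	}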
// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x4(SB), $8-88
    // Loading 10 of 36 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 42 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxGFNI_9x4_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    MOVQ in_base+24(FP), AX
    MOVQ (AX), DX
    MOVQ 24(AX), BX
    MOVQ 48(AX), SI
    MOVQ 72(AX), DI
    MOVQ 96(AX), R8
    MOVQ 120(AX), R9
    MOVQ 144(AX), R10
    MOVQ 168(AX), R11
    MOVQ 192(AX), AX
    MOVQ out_base+48(FP), R12
    MOVQ (R12), R13
    MOVQ 24(R12), R14
    MOVQ 48(R12), R15
    MOVQ 72(R12), R12
    MOVQ start+72(FP), BP

    // Add start offset to output
    ADDQ BP, R13
    ADDQ BP, R14
    ADDQ BP, R15
    ADDQ BP, R12

    // Add start offset to input
    ADDQ BP, DX
    ADDQ BP, BX
    ADDQ BP, SI
    ADDQ BP, DI
    ADDQ BP, R8
    ADDQ BP, R9
    ADDQ BP, R10
    ADDQ BP, R11
    ADDQ BP, AX

    // Reload length to save a register
    MOVQ n+80(FP), BP
    SHRQ $0x05, BP

mulAvxGFNI_9x4_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y13

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU (SI), Y14
    ADDQ $0x20, SI
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU (DI), Y14
    ADDQ $0x20, DI
    VBROADCASTSD 96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU (R8), Y14
    ADDQ $0x20, R8
    VBROADCASTSD 128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 152(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 5 to 4 outputs
    VMOVDQU (R9), Y14
    ADDQ $0x20, R9
    VBROADCASTSD 160(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 168(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 176(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 184(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 6 to 4 outputs
    VMOVDQU (R10), Y14
    ADDQ $0x20, R10
    VBROADCASTSD 192(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 200(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 208(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 216(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 7 to 4 outputs
    VMOVDQU (R11), Y14
    ADDQ $0x20, R11
    VBROADCASTSD 224(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 232(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 240(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 248(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 8 to 4 outputs
    VMOVDQU (AX), Y14
    ADDQ $0x20, AX
    VBROADCASTSD 256(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 264(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 272(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 280(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 4 outputs
    VMOVDQU Y10, (R13)
    ADDQ $0x20, R13
    VMOVDQU Y11, (R14)
    ADDQ $0x20, R14
    VMOVDQU Y12, (R15)
    ADDQ $0x20, R15
    VMOVDQU Y13, (R12)
    ADDQ $0x20, R12

    // Prepare for next loop
    DECQ BP
    JNZ  mulAvxGFNI_9x4_loop
    VZEROUPPER

mulAvxGFNI_9x4_end:
    RET
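// Only part of the 36 coefficient matrices fits in registers (26 in the
// AVX512 version above, 10 in the AVX variant), so the remainder is
// rebroadcast from the matrix each time it is needed: VGF2P8AFFINEQB.BCST
// with a memory operand under AVX512, an explicit VBROADCASTSD here. The
// 8-byte tables are laid out input-major, so the operand offsets follow the
// obvious indexing (i, o and numOut are illustrative names):
//
//	off := 8 * (i*numOut + o) // e.g. input 6, output 0 of 9x4 -> 8*(6*4+0) = 192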
// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x4_64Xor(SB), $8-88
    // Loading 26 of 36 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 42 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX
    TESTQ AX, AX
    JZ mulGFNI_9x4_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    VBROADCASTF32X2 160(CX), Z20
    VBROADCASTF32X2 168(CX), Z21
    VBROADCASTF32X2 176(CX), Z22
    VBROADCASTF32X2 184(CX), Z23
    VBROADCASTF32X2 192(CX), Z24
    VBROADCASTF32X2 200(CX), Z25
    MOVQ in_base+24(FP), AX
    MOVQ (AX), DX
    MOVQ 24(AX), BX
    MOVQ 48(AX), SI
    MOVQ 72(AX), DI
    MOVQ 96(AX), R8
    MOVQ 120(AX), R9
    MOVQ 144(AX), R10
    MOVQ 168(AX), R11
    MOVQ 192(AX), AX
    MOVQ out_base+48(FP), R12
    MOVQ (R12), R13
    MOVQ 24(R12), R14
    MOVQ 48(R12), R15
    MOVQ 72(R12), R12
    MOVQ start+72(FP), BP

    // Add start offset to output
    ADDQ BP, R13
    ADDQ BP, R14
    ADDQ BP, R15
    ADDQ BP, R12

    // Add start offset to input
    ADDQ BP, DX
    ADDQ BP, BX
    ADDQ BP, SI
    ADDQ BP, DI
    ADDQ BP, R8
    ADDQ BP, R9
    ADDQ BP, R10
    ADDQ BP, R11
    ADDQ BP, AX

    // Reload length to save a register
    MOVQ n+80(FP), BP
    SHRQ $0x06, BP

mulGFNI_9x4_64Xor_loop:
    // Load 4 outputs
    VMOVDQU64 (R13), Z26
    VMOVDQU64 (R14), Z27
    VMOVDQU64 (R15), Z28
    VMOVDQU64 (R12), Z29

    // Load and process 64 bytes from input 0 to 4 outputs
    VMOVDQU64 (DX), Z30
    ADDQ $0x40, DX
    VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 1 to 4 outputs
    VMOVDQU64 (BX), Z30
    ADDQ $0x40, BX
    VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 2 to 4 outputs
    VMOVDQU64 (SI), Z30
    ADDQ $0x40, SI
    VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 3 to 4 outputs
    VMOVDQU64 (DI), Z30
    ADDQ $0x40, DI
    VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 4 to 4 outputs
    VMOVDQU64 (R8), Z30
    ADDQ $0x40, R8
    VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 5 to 4 outputs
    VMOVDQU64 (R9), Z30
    ADDQ $0x40, R9
    VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 6 to 4 outputs
    VMOVDQU64 (R10), Z30
    ADDQ $0x40, R10
    VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z25, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 7 to 4 outputs
    VMOVDQU64 (R11), Z30
    ADDQ $0x40, R11
    VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 8 to 4 outputs
    VMOVDQU64 (AX), Z30
    ADDQ $0x40, AX
    VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Store 4 outputs
    VMOVDQU64 Z26, (R13)
    ADDQ $0x40, R13
    VMOVDQU64 Z27, (R14)
    ADDQ $0x40, R14
    VMOVDQU64 Z28, (R15)
    ADDQ $0x40, R15
    VMOVDQU64 Z29, (R12)
    ADDQ $0x40, R12

    // Prepare for next loop
    DECQ BP
    JNZ  mulGFNI_9x4_64Xor_loop
    VZEROUPPER

mulGFNI_9x4_64Xor_end:
    RET
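// The Xor variants differ from the plain kernels only in initialization:
// instead of computing out = sum over i of C[o][i]*in[i], they first load
// the existing output and accumulate into it, which lets callers build a
// result across several calls. Roughly (illustrative pseudocode, mulGF is
// not an identifier from this file):
//
//	acc := load(out[o]) // Xor variant; the plain variant starts from input 0
//	for i := range in {
//	    acc ^= mulGF(C[o][i], in[i])
//	}
//	store(out[o], acc)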
// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88
    // Loading 10 of 36 tables to registers
    // Destination kept in GP registers
    // Full registers estimated 42 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxGFNI_9x4Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    VBROADCASTSD 72(CX), Y9
    MOVQ in_base+24(FP), AX
    MOVQ (AX), DX
    MOVQ 24(AX), BX
    MOVQ 48(AX), SI
    MOVQ 72(AX), DI
    MOVQ 96(AX), R8
    MOVQ 120(AX), R9
    MOVQ 144(AX), R10
    MOVQ 168(AX), R11
    MOVQ 192(AX), AX
    MOVQ out_base+48(FP), R12
    MOVQ (R12), R13
    MOVQ 24(R12), R14
    MOVQ 48(R12), R15
    MOVQ 72(R12), R12
    MOVQ start+72(FP), BP

    // Add start offset to output
    ADDQ BP, R13
    ADDQ BP, R14
    ADDQ BP, R15
    ADDQ BP, R12

    // Add start offset to input
    ADDQ BP, DX
    ADDQ BP, BX
    ADDQ BP, SI
    ADDQ BP, DI
    ADDQ BP, R8
    ADDQ BP, R9
    ADDQ BP, R10
    ADDQ BP, R11
    ADDQ BP, AX

    // Reload length to save a register
    MOVQ n+80(FP), BP
    SHRQ $0x05, BP

mulAvxGFNI_9x4Xor_loop:
    // Load 4 outputs
    VMOVDQU (R13), Y10
    VMOVDQU (R14), Y11
    VMOVDQU (R15), Y12
    VMOVDQU (R12), Y13

    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
    VXORPD Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU (SI), Y14
    ADDQ $0x20, SI
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU (DI), Y14
    ADDQ $0x20, DI
    VBROADCASTSD 96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU (R8), Y14
    ADDQ $0x20, R8
    VBROADCASTSD 128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 152(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 5 to 4 outputs
    VMOVDQU (R9), Y14
    ADDQ $0x20, R9
    VBROADCASTSD 160(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 168(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 176(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 184(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 6 to 4 outputs
    VMOVDQU (R10), Y14
    ADDQ $0x20, R10
    VBROADCASTSD 192(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 200(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 208(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 216(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 7 to 4 outputs
    VMOVDQU (R11), Y14
    ADDQ $0x20, R11
    VBROADCASTSD 224(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 232(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 240(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 248(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 8 to 4 outputs
    VMOVDQU (AX), Y14
    ADDQ $0x20, AX
    VBROADCASTSD 256(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 264(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 272(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 280(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 4 outputs
    VMOVDQU Y10, (R13)
    ADDQ $0x20, R13
    VMOVDQU Y11, (R14)
    ADDQ $0x20, R14
    VMOVDQU Y12, (R15)
    ADDQ $0x20, R15
    VMOVDQU Y13, (R12)
    ADDQ $0x20, R12

    // Prepare for next loop
    DECQ BP
    JNZ  mulAvxGFNI_9x4Xor_loop
    VZEROUPPER

mulAvxGFNI_9x4Xor_end:
    RET
// func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88
    // Loading no tables to registers
    // Destination kept in GP registers
    // Full registers estimated 81 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxTwo_9x4Xor_end
    MOVQ in_base+24(FP), AX
    MOVQ (AX), DX
    MOVQ 24(AX), BX
    MOVQ 48(AX), SI
    MOVQ 72(AX), DI
    MOVQ 96(AX), R8
    MOVQ 120(AX), R9
    MOVQ 144(AX), R10
    MOVQ 168(AX), R11
    MOVQ 192(AX), AX
    MOVQ out_base+48(FP), R12
    MOVQ (R12), R13
    MOVQ 24(R12), R14
    MOVQ 48(R12), R15
    MOVQ 72(R12), R12
    MOVQ start+72(FP), BP

    // Add start offset to output
    ADDQ BP, R13
    ADDQ BP, R14
    ADDQ BP, R15
    ADDQ BP, R12

    // Add start offset to input
    ADDQ BP, DX
    ADDQ BP, BX
    ADDQ BP, SI
    ADDQ BP, DI
    ADDQ BP, R8
    ADDQ BP, R9
    ADDQ BP, R10
    ADDQ BP, R11
    ADDQ BP, AX
    MOVQ $0x0000000f, BP
    MOVQ BP, X4
    VPBROADCASTB X4, Y4
    MOVQ n+80(FP), BP
    SHRQ $0x05, BP

mulAvxTwo_9x4Xor_loop:
    // Load and process 32 bytes from input 0 to 4 outputs
    VMOVDQU (DX), Y7
    ADDQ $0x20, DX
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU (R13), Y0
    VMOVDQU (CX), Y5
    VMOVDQU 32(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU (R14), Y1
    VMOVDQU 64(CX), Y5
    VMOVDQU 96(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU (R15), Y2
    VMOVDQU 128(CX), Y5
    VMOVDQU 160(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU (R12), Y3
    VMOVDQU 192(CX), Y5
    VMOVDQU 224(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 1 to 4 outputs
    VMOVDQU (BX), Y7
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 256(CX), Y5
    VMOVDQU 288(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 320(CX), Y5
    VMOVDQU 352(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 384(CX), Y5
    VMOVDQU 416(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 448(CX), Y5
    VMOVDQU 480(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 2 to 4 outputs
    VMOVDQU (SI), Y7
    ADDQ $0x20, SI
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 512(CX), Y5
    VMOVDQU 544(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 576(CX), Y5
    VMOVDQU 608(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 640(CX), Y5
    VMOVDQU 672(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 704(CX), Y5
    VMOVDQU 736(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 3 to 4 outputs
    VMOVDQU (DI), Y7
    ADDQ $0x20, DI
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 768(CX), Y5
    VMOVDQU 800(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 832(CX), Y5
    VMOVDQU 864(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 896(CX), Y5
    VMOVDQU 928(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 960(CX), Y5
    VMOVDQU 992(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 4 to 4 outputs
    VMOVDQU (R8), Y7
    ADDQ $0x20, R8
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 1024(CX), Y5
    VMOVDQU 1056(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 1088(CX), Y5
    VMOVDQU 1120(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 1152(CX), Y5
    VMOVDQU 1184(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 1216(CX), Y5
    VMOVDQU 1248(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 5 to 4 outputs
    VMOVDQU (R9), Y7
    ADDQ $0x20, R9
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 1280(CX), Y5
    VMOVDQU 1312(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 1344(CX), Y5
    VMOVDQU 1376(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 1408(CX), Y5
    VMOVDQU 1440(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 1472(CX), Y5
    VMOVDQU 1504(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 6 to 4 outputs
    VMOVDQU (R10), Y7
    ADDQ $0x20, R10
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 1536(CX), Y5
    VMOVDQU 1568(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 1600(CX), Y5
    VMOVDQU 1632(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 1664(CX), Y5
    VMOVDQU 1696(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 1728(CX), Y5
    VMOVDQU 1760(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 7 to 4 outputs
    VMOVDQU (R11), Y7
    ADDQ $0x20, R11
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 1792(CX), Y5
    VMOVDQU 1824(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 1856(CX), Y5
    VMOVDQU 1888(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 1920(CX), Y5
    VMOVDQU 1952(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 1984(CX), Y5
    VMOVDQU 2016(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Load and process 32 bytes from input 8 to 4 outputs
    VMOVDQU (AX), Y7
    ADDQ $0x20, AX
    VPSRLQ $0x04, Y7, Y8
    VPAND Y4, Y7, Y7
    VPAND Y4, Y8, Y8
    VMOVDQU 2048(CX), Y5
    VMOVDQU 2080(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y0)
    VMOVDQU 2112(CX), Y5
    VMOVDQU 2144(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y1)
    VMOVDQU 2176(CX), Y5
    VMOVDQU 2208(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y2)
    VMOVDQU 2240(CX), Y5
    VMOVDQU 2272(CX), Y6
    VPSHUFB Y7, Y5, Y5
    VPSHUFB Y8, Y6, Y6
    XOR3WAY( $0x00, Y5, Y6, Y3)

    // Store 4 outputs
    VMOVDQU Y0, (R13)
    ADDQ $0x20, R13
    VMOVDQU Y1, (R14)
    ADDQ $0x20, R14
    VMOVDQU Y2, (R15)
    ADDQ $0x20, R15
    VMOVDQU Y3, (R12)
    ADDQ $0x20, R12

    // Prepare for next loop
    DECQ BP
    JNZ  mulAvxTwo_9x4Xor_loop
    VZEROUPPER

mulAvxTwo_9x4Xor_end:
    RET
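// Note how mulAvxTwo_9x4Xor folds the initial output loads into the input-0
// step: each VMOVDQU from R13..R12 lands immediately before the first
// XOR3WAY for that output, so no separate "load outputs" pass is needed and
// the destinations stay in YMM registers for the rest of the iteration.
// Schematically, for one output (illustrative names):
//
//	y0 := load(out0)            // folded into the input-0 block
//	y0 ^= lo0[b&0x0f] ^ hi0[b>>4] // XOR3WAY, then inputs 1..8 accumulate
//	store(out0, y0)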
// func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept on stack
    // Full registers estimated 100 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxTwo_9x5_end
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX
    MOVQ $0x0000000f, R15
    MOVQ R15, X5
    VPBROADCASTB X5, Y5

mulAvxTwo_9x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y8
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU (CX), Y6
    VMOVDQU 32(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR Y6, Y7, Y0
    VMOVDQU 64(CX), Y6
    VMOVDQU 96(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR Y6, Y7, Y1
    VMOVDQU 128(CX), Y6
    VMOVDQU 160(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR Y6, Y7, Y2
    VMOVDQU 192(CX), Y6
    VMOVDQU 224(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR Y6, Y7, Y3
    VMOVDQU 256(CX), Y6
    VMOVDQU 288(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    VPXOR Y6, Y7, Y4

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (SI), Y8
    ADDQ $0x20, SI
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 320(CX), Y6
    VMOVDQU 352(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 384(CX), Y6
    VMOVDQU 416(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 448(CX), Y6
    VMOVDQU 480(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 512(CX), Y6
    VMOVDQU 544(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 576(CX), Y6
    VMOVDQU 608(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 2 to 5 outputs
    VMOVDQU (DI), Y8
    ADDQ $0x20, DI
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 640(CX), Y6
    VMOVDQU 672(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 704(CX), Y6
    VMOVDQU 736(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 768(CX), Y6
    VMOVDQU 800(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 832(CX), Y6
    VMOVDQU 864(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 896(CX), Y6
    VMOVDQU 928(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 3 to 5 outputs
    VMOVDQU (R8), Y8
    ADDQ $0x20, R8
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 960(CX), Y6
    VMOVDQU 992(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1024(CX), Y6
    VMOVDQU 1056(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1088(CX), Y6
    VMOVDQU 1120(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1152(CX), Y6
    VMOVDQU 1184(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1216(CX), Y6
    VMOVDQU 1248(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 4 to 5 outputs
    VMOVDQU (R9), Y8
    ADDQ $0x20, R9
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 1280(CX), Y6
    VMOVDQU 1312(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1344(CX), Y6
    VMOVDQU 1376(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1408(CX), Y6
    VMOVDQU 1440(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1472(CX), Y6
    VMOVDQU 1504(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1536(CX), Y6
    VMOVDQU 1568(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 5 to 5 outputs
    VMOVDQU (R10), Y8
    ADDQ $0x20, R10
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 1600(CX), Y6
    VMOVDQU 1632(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1664(CX), Y6
    VMOVDQU 1696(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1728(CX), Y6
    VMOVDQU 1760(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1792(CX), Y6
    VMOVDQU 1824(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1856(CX), Y6
    VMOVDQU 1888(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 6 to 5 outputs
    VMOVDQU (R11), Y8
    ADDQ $0x20, R11
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 1920(CX), Y6
    VMOVDQU 1952(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1984(CX), Y6
    VMOVDQU 2016(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 2048(CX), Y6
    VMOVDQU 2080(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 2112(CX), Y6
    VMOVDQU 2144(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 2176(CX), Y6
    VMOVDQU 2208(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 7 to 5 outputs
    VMOVDQU (R12), Y8
    ADDQ $0x20, R12
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 2240(CX), Y6
    VMOVDQU 2272(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 2304(CX), Y6
    VMOVDQU 2336(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 2368(CX), Y6
    VMOVDQU 2400(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 2432(CX), Y6
    VMOVDQU 2464(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 2496(CX), Y6
    VMOVDQU 2528(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 8 to 5 outputs
    VMOVDQU (DX), Y8
    ADDQ $0x20, DX
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 2560(CX), Y6
    VMOVDQU 2592(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 2624(CX), Y6
    VMOVDQU 2656(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 2688(CX), Y6
    VMOVDQU 2720(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 2752(CX), Y6
    VMOVDQU 2784(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 2816(CX), Y6
    VMOVDQU 2848(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Store 5 outputs
    MOVQ (R13), R15
    VMOVDQU Y0, (R15)(R14*1)
    MOVQ 24(R13), R15
    VMOVDQU Y1, (R15)(R14*1)
    MOVQ 48(R13), R15
    VMOVDQU Y2, (R15)(R14*1)
    MOVQ 72(R13), R15
    VMOVDQU Y3, (R15)(R14*1)
    MOVQ 96(R13), R15
    VMOVDQU Y4, (R15)(R14*1)

    // Prepare for next loop
    ADDQ $0x20, R14
    DECQ AX
    JNZ  mulAvxTwo_9x5_loop
    VZEROUPPER

mulAvxTwo_9x5_end:
    RET
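// With nine input pointers and five outputs there are not enough general
// purpose registers to pin every output pointer, so the 9x5 kernels keep the
// destination "on stack": R14 tracks the running byte offset, and each store
// rereads the output slice base from the out_base array before writing
// through (R15)(R14*1). A sketch of the addressing (illustrative names):
//
//	ptr := out[o]            // MOVQ o*24(R13), R15
//	store(ptr[offset:], acc) // VMOVDQU Yk, (R15)(R14*1)
//	// after all outputs: offset += 32 (ADDQ $0x20, R14)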
// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x5_64(SB), $0-88
    // Loading 25 of 45 tables to registers
    // Destination kept on stack
    // Full registers estimated 52 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX
    TESTQ AX, AX
    JZ mulGFNI_9x5_64_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    VBROADCASTF32X2 160(CX), Z20
    VBROADCASTF32X2 168(CX), Z21
    VBROADCASTF32X2 176(CX), Z22
    VBROADCASTF32X2 184(CX), Z23
    VBROADCASTF32X2 192(CX), Z24
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX

mulGFNI_9x5_64_loop:
    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64 (BX), Z30
    ADDQ $0x40, BX
    VGF2P8AFFINEQB $0x00, Z0, Z30, Z25
    VGF2P8AFFINEQB $0x00, Z1, Z30, Z26
    VGF2P8AFFINEQB $0x00, Z2, Z30, Z27
    VGF2P8AFFINEQB $0x00, Z3, Z30, Z28
    VGF2P8AFFINEQB $0x00, Z4, Z30, Z29

    // Load and process 64 bytes from input 1 to 5 outputs
    VMOVDQU64 (SI), Z30
    ADDQ $0x40, SI
    VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 2 to 5 outputs
    VMOVDQU64 (DI), Z30
    ADDQ $0x40, DI
    VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 3 to 5 outputs
    VMOVDQU64 (R8), Z30
    ADDQ $0x40, R8
    VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 4 to 5 outputs
    VMOVDQU64 (R9), Z30
    ADDQ $0x40, R9
    VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 5 to 5 outputs
    VMOVDQU64 (R10), Z30
    ADDQ $0x40, R10
    VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 6 to 5 outputs
    VMOVDQU64 (R11), Z30
    ADDQ $0x40, R11
    VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 7 to 5 outputs
    VMOVDQU64 (R12), Z30
    ADDQ $0x40, R12
    VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 8 to 5 outputs
    VMOVDQU64 (DX), Z30
    ADDQ $0x40, DX
    VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Store 5 outputs
    MOVQ (R13), R15
    VMOVDQU64 Z25, (R15)(R14*1)
    MOVQ 24(R13), R15
    VMOVDQU64 Z26, (R15)(R14*1)
    MOVQ 48(R13), R15
    VMOVDQU64 Z27, (R15)(R14*1)
    MOVQ 72(R13), R15
    VMOVDQU64 Z28, (R15)(R14*1)
    MOVQ 96(R13), R15
    VMOVDQU64 Z29, (R15)(R14*1)

    // Prepare for next loop
    ADDQ $0x40, R14
    DECQ AX
    JNZ  mulGFNI_9x5_64_loop
    VZEROUPPER

mulGFNI_9x5_64_end:
    RET
// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x5(SB), $0-88
    // Loading 9 of 45 tables to registers
    // Destination kept on stack
    // Full registers estimated 52 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxGFNI_9x5_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX

mulAvxGFNI_9x5_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y9
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y10
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y11
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y12
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y13

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (SI), Y14
    ADDQ $0x20, SI
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 72(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 5 outputs
    VMOVDQU (DI), Y14
    ADDQ $0x20, DI
    VBROADCASTSD 80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 5 outputs
    VMOVDQU (R8), Y14
    ADDQ $0x20, R8
    VBROADCASTSD 120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 152(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 5 outputs
    VMOVDQU (R9), Y14
    ADDQ $0x20, R9
    VBROADCASTSD 160(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 168(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 176(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 184(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 192(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 5 to 5 outputs
    VMOVDQU (R10), Y14
    ADDQ $0x20, R10
    VBROADCASTSD 200(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 208(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 216(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 224(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 232(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 6 to 5 outputs
    VMOVDQU (R11), Y14
    ADDQ $0x20, R11
    VBROADCASTSD 240(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 248(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 256(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 264(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 272(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 7 to 5 outputs
    VMOVDQU (R12), Y14
    ADDQ $0x20, R12
    VBROADCASTSD 280(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 288(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 296(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 304(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 312(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 8 to 5 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VBROADCASTSD 320(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 328(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 336(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 344(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 352(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 5 outputs
    MOVQ (R13), R15
    VMOVDQU Y9, (R15)(R14*1)
    MOVQ 24(R13), R15
    VMOVDQU Y10, (R15)(R14*1)
    MOVQ 48(R13), R15
    VMOVDQU Y11, (R15)(R14*1)
    MOVQ 72(R13), R15
    VMOVDQU Y12, (R15)(R14*1)
    MOVQ 96(R13), R15
    VMOVDQU Y13, (R15)(R14*1)

    // Prepare for next loop
    ADDQ $0x20, R14
    DECQ AX
    JNZ  mulAvxGFNI_9x5_loop
    VZEROUPPER

mulAvxGFNI_9x5_end:
    RET
// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_9x5_64Xor(SB), $0-88
    // Loading 25 of 45 tables to registers
    // Destination kept on stack
    // Full registers estimated 52 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX
    TESTQ AX, AX
    JZ mulGFNI_9x5_64Xor_end
    VBROADCASTF32X2 (CX), Z0
    VBROADCASTF32X2 8(CX), Z1
    VBROADCASTF32X2 16(CX), Z2
    VBROADCASTF32X2 24(CX), Z3
    VBROADCASTF32X2 32(CX), Z4
    VBROADCASTF32X2 40(CX), Z5
    VBROADCASTF32X2 48(CX), Z6
    VBROADCASTF32X2 56(CX), Z7
    VBROADCASTF32X2 64(CX), Z8
    VBROADCASTF32X2 72(CX), Z9
    VBROADCASTF32X2 80(CX), Z10
    VBROADCASTF32X2 88(CX), Z11
    VBROADCASTF32X2 96(CX), Z12
    VBROADCASTF32X2 104(CX), Z13
    VBROADCASTF32X2 112(CX), Z14
    VBROADCASTF32X2 120(CX), Z15
    VBROADCASTF32X2 128(CX), Z16
    VBROADCASTF32X2 136(CX), Z17
    VBROADCASTF32X2 144(CX), Z18
    VBROADCASTF32X2 152(CX), Z19
    VBROADCASTF32X2 160(CX), Z20
    VBROADCASTF32X2 168(CX), Z21
    VBROADCASTF32X2 176(CX), Z22
    VBROADCASTF32X2 184(CX), Z23
    VBROADCASTF32X2 192(CX), Z24
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX

mulGFNI_9x5_64Xor_loop:
    // Load 5 outputs
    MOVQ (R13), R15
    VMOVDQU64 (R15)(R14*1), Z25
    MOVQ 24(R13), R15
    VMOVDQU64 (R15)(R14*1), Z26
    MOVQ 48(R13), R15
    VMOVDQU64 (R15)(R14*1), Z27
    MOVQ 72(R13), R15
    VMOVDQU64 (R15)(R14*1), Z28
    MOVQ 96(R13), R15
    VMOVDQU64 (R15)(R14*1), Z29

    // Load and process 64 bytes from input 0 to 5 outputs
    VMOVDQU64 (BX), Z30
    ADDQ $0x40, BX
    VGF2P8AFFINEQB $0x00, Z0, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z1, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z2, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z3, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z4, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 1 to 5 outputs
    VMOVDQU64 (SI), Z30
    ADDQ $0x40, SI
    VGF2P8AFFINEQB $0x00, Z5, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z6, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 2 to 5 outputs
    VMOVDQU64 (DI), Z30
    ADDQ $0x40, DI
    VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 3 to 5 outputs
    VMOVDQU64 (R8), Z30
    ADDQ $0x40, R8
    VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 4 to 5 outputs
    VMOVDQU64 (R9), Z30
    ADDQ $0x40, R9
    VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB $0x00, Z21, Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB $0x00, Z22, Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB $0x00, Z23, Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB $0x00, Z24, Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 5 to 5 outputs
    VMOVDQU64 (R10), Z30
    ADDQ $0x40, R10
    VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 6 to 5 outputs
    VMOVDQU64 (R11), Z30
    ADDQ $0x40, R11
    VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 7 to 5 outputs
    VMOVDQU64 (R12), Z30
    ADDQ $0x40, R12
    VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Load and process 64 bytes from input 8 to 5 outputs
    VMOVDQU64 (DX), Z30
    ADDQ $0x40, DX
    VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31
    VXORPD Z25, Z31, Z25
    VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31
    VXORPD Z26, Z31, Z26
    VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31
    VXORPD Z27, Z31, Z27
    VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31
    VXORPD Z28, Z31, Z28
    VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31
    VXORPD Z29, Z31, Z29

    // Store 5 outputs
    MOVQ (R13), R15
    VMOVDQU64 Z25, (R15)(R14*1)
    MOVQ 24(R13), R15
    VMOVDQU64 Z26, (R15)(R14*1)
    MOVQ 48(R13), R15
    VMOVDQU64 Z27, (R15)(R14*1)
    MOVQ 72(R13), R15
    VMOVDQU64 Z28, (R15)(R14*1)
    MOVQ 96(R13), R15
    VMOVDQU64 Z29, (R15)(R14*1)

    // Prepare for next loop
    ADDQ $0x40, R14
    DECQ AX
    JNZ  mulGFNI_9x5_64Xor_loop
    VZEROUPPER

mulGFNI_9x5_64Xor_end:
    RET
// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88
    // Loading 9 of 45 tables to registers
    // Destination kept on stack
    // Full registers estimated 52 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxGFNI_9x5Xor_end
    VBROADCASTSD (CX), Y0
    VBROADCASTSD 8(CX), Y1
    VBROADCASTSD 16(CX), Y2
    VBROADCASTSD 24(CX), Y3
    VBROADCASTSD 32(CX), Y4
    VBROADCASTSD 40(CX), Y5
    VBROADCASTSD 48(CX), Y6
    VBROADCASTSD 56(CX), Y7
    VBROADCASTSD 64(CX), Y8
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX

mulAvxGFNI_9x5Xor_loop:
    // Load 5 outputs
    MOVQ (R13), R15
    VMOVDQU (R15)(R14*1), Y9
    MOVQ 24(R13), R15
    VMOVDQU (R15)(R14*1), Y10
    MOVQ 48(R13), R15
    VMOVDQU (R15)(R14*1), Y11
    MOVQ 72(R13), R15
    VMOVDQU (R15)(R14*1), Y12
    MOVQ 96(R13), R15
    VMOVDQU (R15)(R14*1), Y13

    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
    VXORPD Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
    VXORPD Y12, Y15, Y12
    VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (SI), Y14
    ADDQ $0x20, SI
    VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
    VXORPD Y9, Y15, Y9
    VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
    VXORPD Y10, Y15, Y10
    VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
    VXORPD Y11, Y15, Y11
    VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 72(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 2 to 5 outputs
    VMOVDQU (DI), Y14
    ADDQ $0x20, DI
    VBROADCASTSD 80(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 88(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 96(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 104(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 112(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 3 to 5 outputs
    VMOVDQU (R8), Y14
    ADDQ $0x20, R8
    VBROADCASTSD 120(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 128(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 136(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 144(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 152(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 4 to 5 outputs
    VMOVDQU (R9), Y14
    ADDQ $0x20, R9
    VBROADCASTSD 160(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 168(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 176(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 184(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 192(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 5 to 5 outputs
    VMOVDQU (R10), Y14
    ADDQ $0x20, R10
    VBROADCASTSD 200(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 208(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 216(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 224(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 232(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 6 to 5 outputs
    VMOVDQU (R11), Y14
    ADDQ $0x20, R11
    VBROADCASTSD 240(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 248(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 256(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 264(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 272(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 7 to 5 outputs
    VMOVDQU (R12), Y14
    ADDQ $0x20, R12
    VBROADCASTSD 280(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 288(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 296(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 304(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 312(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Load and process 32 bytes from input 8 to 5 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VBROADCASTSD 320(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y9, Y15, Y9
    VBROADCASTSD 328(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y10, Y15, Y10
    VBROADCASTSD 336(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y11, Y15, Y11
    VBROADCASTSD 344(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y12, Y15, Y12
    VBROADCASTSD 352(CX), Y15
    VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
    VXORPD Y13, Y15, Y13

    // Store 5 outputs
    MOVQ (R13), R15
    VMOVDQU Y9, (R15)(R14*1)
    MOVQ 24(R13), R15
    VMOVDQU Y10, (R15)(R14*1)
    MOVQ 48(R13), R15
    VMOVDQU Y11, (R15)(R14*1)
    MOVQ 72(R13), R15
    VMOVDQU Y12, (R15)(R14*1)
    MOVQ 96(R13), R15
    VMOVDQU Y13, (R15)(R14*1)

    // Prepare for next loop
    ADDQ $0x20, R14
    DECQ AX
    JNZ  mulAvxGFNI_9x5Xor_loop
    VZEROUPPER

mulAvxGFNI_9x5Xor_end:
    RET
// func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept on stack
    // Full registers estimated 100 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxTwo_9x5Xor_end
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX
    MOVQ $0x0000000f, R15
    MOVQ R15, X5
    VPBROADCASTB X5, Y5

mulAvxTwo_9x5Xor_loop:
    // Load and process 32 bytes from input 0 to 5 outputs
    VMOVDQU (BX), Y8
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    MOVQ (R13), R15
    VMOVDQU (R15)(R14*1), Y0
    VMOVDQU (CX), Y6
    VMOVDQU 32(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    MOVQ 24(R13), R15
    VMOVDQU (R15)(R14*1), Y1
    VMOVDQU 64(CX), Y6
    VMOVDQU 96(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    MOVQ 48(R13), R15
    VMOVDQU (R15)(R14*1), Y2
    VMOVDQU 128(CX), Y6
    VMOVDQU 160(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    MOVQ 72(R13), R15
    VMOVDQU (R15)(R14*1), Y3
    VMOVDQU 192(CX), Y6
    VMOVDQU 224(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    MOVQ 96(R13), R15
    VMOVDQU (R15)(R14*1), Y4
    VMOVDQU 256(CX), Y6
    VMOVDQU 288(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 1 to 5 outputs
    VMOVDQU (SI), Y8
    ADDQ $0x20, SI
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 320(CX), Y6
    VMOVDQU 352(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 384(CX), Y6
    VMOVDQU 416(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 448(CX), Y6
    VMOVDQU 480(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 512(CX), Y6
    VMOVDQU 544(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 576(CX), Y6
    VMOVDQU 608(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 2 to 5 outputs
    VMOVDQU (DI), Y8
    ADDQ $0x20, DI
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 640(CX), Y6
    VMOVDQU 672(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 704(CX), Y6
    VMOVDQU 736(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 768(CX), Y6
    VMOVDQU 800(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 832(CX), Y6
    VMOVDQU 864(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 896(CX), Y6
    VMOVDQU 928(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 3 to 5 outputs
    VMOVDQU (R8), Y8
    ADDQ $0x20, R8
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 960(CX), Y6
    VMOVDQU 992(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1024(CX), Y6
    VMOVDQU 1056(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1088(CX), Y6
    VMOVDQU 1120(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1152(CX), Y6
    VMOVDQU 1184(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1216(CX), Y6
    VMOVDQU 1248(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 4 to 5 outputs
    VMOVDQU (R9), Y8
    ADDQ $0x20, R9
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 1280(CX), Y6
    VMOVDQU 1312(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1344(CX), Y6
    VMOVDQU 1376(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1408(CX), Y6
    VMOVDQU 1440(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1472(CX), Y6
    VMOVDQU 1504(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1536(CX), Y6
    VMOVDQU 1568(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 5 to 5 outputs
    VMOVDQU (R10), Y8
    ADDQ $0x20, R10
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 1600(CX), Y6
    VMOVDQU 1632(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1664(CX), Y6
    VMOVDQU 1696(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 1728(CX), Y6
    VMOVDQU 1760(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 1792(CX), Y6
    VMOVDQU 1824(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 1856(CX), Y6
    VMOVDQU 1888(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 6 to 5 outputs
    VMOVDQU (R11), Y8
    ADDQ $0x20, R11
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 1920(CX), Y6
    VMOVDQU 1952(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 1984(CX), Y6
    VMOVDQU 2016(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 2048(CX), Y6
    VMOVDQU 2080(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 2112(CX), Y6
    VMOVDQU 2144(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 2176(CX), Y6
    VMOVDQU 2208(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 7 to 5 outputs
    VMOVDQU (R12), Y8
    ADDQ $0x20, R12
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 2240(CX), Y6
    VMOVDQU 2272(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 2304(CX), Y6
    VMOVDQU 2336(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 2368(CX), Y6
    VMOVDQU 2400(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 2432(CX), Y6
    VMOVDQU 2464(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 2496(CX), Y6
    VMOVDQU 2528(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Load and process 32 bytes from input 8 to 5 outputs
    VMOVDQU (DX), Y8
    ADDQ $0x20, DX
    VPSRLQ $0x04, Y8, Y9
    VPAND Y5, Y8, Y8
    VPAND Y5, Y9, Y9
    VMOVDQU 2560(CX), Y6
    VMOVDQU 2592(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y0)
    VMOVDQU 2624(CX), Y6
    VMOVDQU 2656(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y1)
    VMOVDQU 2688(CX), Y6
    VMOVDQU 2720(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y2)
    VMOVDQU 2752(CX), Y6
    VMOVDQU 2784(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y3)
    VMOVDQU 2816(CX), Y6
    VMOVDQU 2848(CX), Y7
    VPSHUFB Y8, Y6, Y6
    VPSHUFB Y9, Y7, Y7
    XOR3WAY( $0x00, Y6, Y7, Y4)

    // Store 5 outputs
    MOVQ (R13), R15
    VMOVDQU Y0, (R15)(R14*1)
    MOVQ 24(R13), R15
    VMOVDQU Y1, (R15)(R14*1)
    MOVQ 48(R13), R15
    VMOVDQU Y2, (R15)(R14*1)
    MOVQ 72(R13), R15
    VMOVDQU Y3, (R15)(R14*1)
    MOVQ 96(R13), R15
    VMOVDQU Y4, (R15)(R14*1)

    // Prepare for next loop
    ADDQ $0x20, R14
    DECQ AX
    JNZ  mulAvxTwo_9x5Xor_loop
    VZEROUPPER

mulAvxTwo_9x5Xor_end:
    RET
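// These TEXT symbols are exposed through the generated stubs in
// galois_gen_amd64.go (see the file header). A hedged sketch of a call site,
// assuming the caller has already checked the required CPU features and
// sized the shards; the real dispatch in this package may differ:
//
//	n := len(in[0]) &^ 31 // the AVX2 kernels process 32 bytes per iteration
//	if n > 0 {
//	    mulAvxTwo_9x5Xor(matrix, in, out, 0, n)
//	}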
// func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88
    // Loading no tables to registers
    // Destination kept on stack
    // Full registers estimated 119 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxTwo_9x6_end
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), R9
    MOVQ 120(DX), R10
    MOVQ 144(DX), R11
    MOVQ 168(DX), R12
    MOVQ 192(DX), DX
    MOVQ out_base+48(FP), R13
    MOVQ start+72(FP), R14

    // Add start offset to input
    ADDQ R14, BX
    ADDQ R14, SI
    ADDQ R14, DI
    ADDQ R14, R8
    ADDQ R14, R9
    ADDQ R14, R10
    ADDQ R14, R11
    ADDQ R14, R12
    ADDQ R14, DX
    MOVQ $0x0000000f, R15
    MOVQ R15, X6
    VPBROADCASTB X6, Y6

mulAvxTwo_9x6_loop:
    // Load and process 32 bytes from input 0 to 6 outputs
    VMOVDQU (BX), Y9
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y9, Y10
    VPAND Y6, Y9, Y9
    VPAND Y6, Y10, Y10
    VMOVDQU (CX), Y7
    VMOVDQU 32(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR Y7, Y8, Y0
    VMOVDQU 64(CX), Y7
    VMOVDQU 96(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR Y7, Y8, Y1
    VMOVDQU 128(CX), Y7
    VMOVDQU 160(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR Y7, Y8, Y2
    VMOVDQU 192(CX), Y7
    VMOVDQU 224(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR Y7, Y8, Y3
    VMOVDQU 256(CX), Y7
    VMOVDQU 288(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR Y7, Y8, Y4
    VMOVDQU 320(CX), Y7
    VMOVDQU 352(CX), Y8
    VPSHUFB Y9, Y7, Y7
    VPSHUFB Y10, Y8, Y8
    VPXOR Y7, Y8, Y5

    // Load and process 32 bytes from
input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 
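// Note the asymmetry in this loop: input 0 initializes the six accumulators
// Y0-Y5 with a plain VPXOR of its two table lookups, while every later input
// (such as input 5 here) accumulates into them with the three-input XOR3WAY
// helper. With six outputs the tables no longer fit in registers ("Loading
// no tables to registers" above), so each 32-byte half-table is re-read from
// the matrix at (CX) on every iteration and only the 0x0f mask in Y6 stays
// resident.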
VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3072(CX), Y7 VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x6_loop VZEROUPPER mulAvxTwo_9x6_end: RET // func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64(SB), $0-88 // Loading 24 of 54 tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x6_64_end 
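// The GFNI kernels below replace the nibble tables entirely: each matrix
// coefficient is pre-expanded into an 8x8 GF(2) bit matrix packed in one
// uint64. VBROADCASTF32X2 replicates that qword to all eight lanes of a Z
// register, and a single VGF2P8AFFINEQB then multiplies 64 input bytes by
// the constant at once (hence SHRQ $0x06 on n). The first 24 coefficients
// stay resident in Z0-Z23; the remaining 30 are consumed straight from
// memory through the .BCST embedded-broadcast form. Scalar semantics of
// VGF2P8AFFINEQB with imm8 == 0, following Intel's documented bit and byte
// ordering (a sketch; gf2p8affine is an assumed name, and it relies on the
// standard library's math/bits):
//
//	func gf2p8affine(m uint64, x byte) byte {
//		var y byte
//		for i := 0; i < 8; i++ {
//			row := byte(m >> ((7 - i) * 8)) // matrix row for result bit i
//			if bits.OnesCount8(row&x)&1 == 1 {
//				y |= 1 << i
//			}
//		}
//		return y
//	}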
VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 
272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 6 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs MOVQ (R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x6_64_loop VZEROUPPER mulGFNI_9x6_64_end: RET // func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x6(SB), $0-88 // Loading 8 of 54 tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
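// Coefficient layout for the GFNI kernels: the 8-byte bit matrix for input
// i and output j sits at offset 8*(i*outputs+j) from the matrix base. For
// this 9x6 shape, input 1/output 2 lands at 8*(1*6+2) = 64(CX) and input
// 8/output 5 at 8*(8*6+5) = 424(CX), matching the VBROADCASTSD and .BCST
// offsets used throughout these loops.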
VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 
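// With only sixteen 256-bit registers, this AVX variant keeps just the
// first 8 of the 54 coefficient matrices resident (Y0-Y7, per "Loading 8 of
// 54 tables" above). Every other coefficient is re-broadcast into the
// scratch register Y15 with VBROADCASTSD immediately before the
// VGF2P8AFFINEQB that consumes it, trading extra loads for the smaller
// register file compared with the 512-bit version.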
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x6_loop VZEROUPPER mulAvxGFNI_9x6_end: RET // func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 // Loading 24 of 54 tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x6_64Xor_loop: // Load 6 outputs MOVQ (R13), R15 VMOVDQU64 (R15)(R14*1), Z24 MOVQ 24(R13), R15 VMOVDQU64 (R15)(R14*1), Z25 MOVQ 48(R13), R15 VMOVDQU64 (R15)(R14*1), Z26 MOVQ 72(R13), R15 VMOVDQU64 (R15)(R14*1), Z27 MOVQ 96(R13), R15 VMOVDQU64 (R15)(R14*1), Z28 MOVQ 120(R13), R15 VMOVDQU64 (R15)(R14*1), Z29 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 
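// The Xor suffix marks the accumulating variant: the loop first reloads the
// current contents of all six destinations into Z24-Z29 ("Load 6 outputs"
// above), so even input 0 XORs its products onto existing data instead of
// overwriting it. The non-Xor twin above is otherwise identical; presumably
// the Go wrappers select this form when the outputs already hold partial
// parity that must be extended.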
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 6 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs MOVQ (R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 24(R13), R15 
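// There are not enough general-purpose registers left for six separate
// output pointers on top of the nine input pointers, so the out slice
// headers stay behind the table at R13: each store reloads out[j]'s base
// from 24*j(R13) into R15 (24 bytes per slice header) and indexes it with
// the running byte offset in R14, which advances by 0x40 per 64-byte
// iteration.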
VMOVDQU64 Z25, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x6_64Xor_loop VZEROUPPER mulGFNI_9x6_64Xor_end: RET // func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 // Loading 8 of 54 tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x6Xor_loop: // Load 6 outputs MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y8 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y9 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y10 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y11 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y12 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x6Xor_loop VZEROUPPER mulAvxGFNI_9x6Xor_end: RET // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, 
AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 119 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 mulAvxTwo_9x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 
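// This Xor flavor of the PSHUFB kernel seeds each accumulator from memory:
// for input 0 the destination block is first loaded through the out pointer
// table (MOVQ 24*j(R13), then VMOVDQU (R15)(R14*1)) into Y0-Y5, so even the
// first input's products are folded in with XOR3WAY rather than
// initializing the registers with VPXOR as the non-Xor version does.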
XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, 
Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3072(CX), Y7 VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x6Xor_loop VZEROUPPER mulAvxTwo_9x6Xor_end: RET // func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 138 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 mulAvxTwo_9x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), 
Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 
2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3584(CX), Y8 VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, 
(R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x7_loop VZEROUPPER mulAvxTwo_9x7_end: RET // func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64(SB), $0-88 // Loading 23 of 63 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x7_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x7_64_loop: // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), 
Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 7 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x7_64_loop VZEROUPPER mulGFNI_9x7_64_end: RET // func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x7(SB), $0-88 // Loading 7 of 63 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 
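// For the 9x7 shape there are 63 coefficient matrices in total; this AVX
// variant can afford to keep only input 0's seven resident (Y0-Y6, per
// "Loading 7 of 63 tables" above), streaming the other 56 through the
// scratch register Y15 inside the loop just as the 9x6 kernel does.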
VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x7_loop VZEROUPPER mulAvxGFNI_9x7_end: RET // func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 // Loading 23 of 63 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 
56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x7_64Xor_loop: // Load 7 outputs MOVQ (R13), R15 VMOVDQU64 (R15)(R14*1), Z23 MOVQ 24(R13), R15 VMOVDQU64 (R15)(R14*1), Z24 MOVQ 48(R13), R15 VMOVDQU64 (R15)(R14*1), Z25 MOVQ 72(R13), R15 VMOVDQU64 (R15)(R14*1), Z26 MOVQ 96(R13), R15 VMOVDQU64 (R15)(R14*1), Z27 MOVQ 120(R13), R15 VMOVDQU64 (R15)(R14*1), Z28 MOVQ 144(R13), R15 VMOVDQU64 (R15)(R14*1), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 
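	// The .BCST forms read one 8-byte affine table straight from the matrix
	// slice via an EVEX embedded broadcast; tables from index 23 (offset
	// 184(CX)) onward did not fit in the Z registers loaded above.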
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 7 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x7_64Xor_loop VZEROUPPER mulGFNI_9x7_64Xor_end: RET // func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88 // Loading 7 of 63 tables to registers // Destination kept on stack // Full registers estimated 72 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), 
DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x7Xor_loop: // Load 7 outputs MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y7 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y8 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y9 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y10 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y11 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y12 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 
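	// Each table reload pairs a VBROADCASTSD of the 8-byte matrix with the
	// VGF2P8AFFINEQB that consumes it; Y15 holds first the table, then the
	// product that is XORed into the accumulator.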
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x7Xor_loop VZEROUPPER mulAvxGFNI_9x7Xor_end: RET // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 138 YMM used MOVQ n+80(FP), AX MOVQ 
matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 mulAvxTwo_9x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and 
process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, 
Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3584(CX), Y8 VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x7Xor_loop VZEROUPPER mulAvxTwo_9x7Xor_end: RET // func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 157 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 mulAvxTwo_9x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 
128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), 
Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 
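	// Classic split-nibble lookup: each source byte is divided into its low
	// and high 4-bit halves (Y11/Y12, masked with Y8), each half indexes a
	// 32-byte PSHUFB table, and the two partial products are folded into the
	// accumulator with XOR3WAY.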
VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4096(CX), Y9 VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x8_loop VZEROUPPER mulAvxTwo_9x8_end: RET // func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64(SB), $0-88 // Loading 22 of 72 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), 
R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 
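	// VXORPD here is just a 512-bit XOR: addition in GF(2^8) is XOR, so each
	// product is folded into its accumulator bitwise.
	//
	// A minimal scalar sketch (not part of this file) of what one loop pass
	// computes per byte offset i, assuming a hypothetical gf2p8affine helper
	// with the semantics of a single VGF2P8AFFINEQB lane; the table layout
	// used by this file is index = input*outputs + output:
	//
	//	func mulGFNIRef(matrix []uint64, in, out [][]byte, i int) {
	//		for r := range out {
	//			var acc byte
	//			for c := range in {
	//				acc ^= gf2p8affine(matrix[c*len(out)+r], in[c][i]) // gf2p8affine is assumed, not in this package
	//			}
	//			out[r][i] = acc
	//		}
	//	}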
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 8 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R13), R15 VMOVDQU64 Z22, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x8_64_loop VZEROUPPER mulGFNI_9x8_64_end: RET // func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x8(SB), $0-88 // Loading 6 of 72 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 
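	// Input 0 writes the eight accumulators directly instead of XORing, so
	// the previous destination contents are overwritten; the Xor variants
	// instead pre-load the outputs and accumulate into them.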
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x8_loop VZEROUPPER mulAvxGFNI_9x8_end: RET // func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 // Loading 22 of 72 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 
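	// VBROADCASTF32X2 replicates one 8-byte affine matrix (as a pair of
	// dwords) across all eight 64-bit lanes of a Z register.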
VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x8_64Xor_loop: // Load 8 outputs MOVQ (R13), R15 VMOVDQU64 (R15)(R14*1), Z22 MOVQ 24(R13), R15 VMOVDQU64 (R15)(R14*1), Z23 MOVQ 48(R13), R15 VMOVDQU64 (R15)(R14*1), Z24 MOVQ 72(R13), R15 VMOVDQU64 (R15)(R14*1), Z25 MOVQ 96(R13), R15 VMOVDQU64 (R15)(R14*1), Z26 MOVQ 120(R13), R15 VMOVDQU64 (R15)(R14*1), Z27 MOVQ 144(R13), R15 VMOVDQU64 (R15)(R14*1), Z28 MOVQ 168(R13), R15 VMOVDQU64 (R15)(R14*1), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, 
Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 8 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R13), R15 VMOVDQU64 Z22, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ 
AX JNZ mulGFNI_9x8_64Xor_loop VZEROUPPER mulGFNI_9x8_64Xor_end: RET // func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88 // Loading 6 of 72 tables to registers // Destination kept on stack // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x8Xor_loop: // Load 8 outputs MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y6 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y7 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y8 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y9 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y10 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y11 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y12 MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x8Xor_loop VZEROUPPER mulAvxGFNI_9x8Xor_end: RET // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 157 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 mulAvxTwo_9x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI 
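// The nibble-table multiply that follows is the heart of every
// mulAvxTwo kernel: VPSRLQ and VPAND split each byte into its low
// and high 4-bit halves (the 0x0f mask is broadcast in Y8), each
// half indexes a 16-entry VPSHUFB lookup table, and the two results
// are XORed into the running output. A scalar Go sketch of one
// input/output pair (hypothetical table names, illustration only):
//
//	for i, b := range in[:32] {
//		out[i] ^= mulTableLow[b&0x0f] ^ mulTableHigh[b>>4]
//	}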
VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( 
$0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) 
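// Table addressing in this 9x8 kernel: each (input, output) pair
// owns a 64-byte slot (a 32-byte low-nibble table followed by a
// 32-byte high-nibble table), so the tables for input i, output j
// start at (i*8+j)*64 from CX. The lookups for input 7, output 6
// therefore begin at (7*8+6)*64 = 3968 below, and XOR3WAY folds each
// lookup pair into its accumulator with a three-way XOR.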
VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4096(CX), Y9 VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x8Xor_loop VZEROUPPER mulAvxTwo_9x8Xor_end: RET // func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 176 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 mulAvxTwo_9x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, 
Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, 
Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 
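// The XOR3WAY that follows folds this lookup pair into output Y6.
// With nine outputs (Y0-Y8), the nibble mask (Y9), two table
// registers (Y10/Y11) and two index registers (Y12/Y13), the 9x9
// kernel keeps 14 of the 16 YMM registers live, which is presumably
// why it caches no tables in registers and reloads them from CX on
// every step.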
XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4608(CX), Y10 VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y8, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x9_loop VZEROUPPER mulAvxTwo_9x9_end: RET // func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64(SB), $0-88 // Loading 21 of 81 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ 
mulGFNI_9x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 
296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 9 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R13), R15 VMOVDQU64 Z21, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z22, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 144(R13), 
R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x9_64_loop VZEROUPPER mulGFNI_9x9_64_end: RET // func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x9(SB), $0-88 // Loading 5 of 81 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x9_loop VZEROUPPER mulAvxGFNI_9x9_end: RET // func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 // Loading 21 of 81 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x9_64Xor_loop: // Load 9 outputs MOVQ (R13), R15 VMOVDQU64 (R15)(R14*1), Z21 MOVQ 24(R13), R15 VMOVDQU64 (R15)(R14*1), Z22 MOVQ 48(R13), R15 VMOVDQU64 (R15)(R14*1), Z23 MOVQ 72(R13), R15 VMOVDQU64 (R15)(R14*1), Z24 MOVQ 96(R13), R15 VMOVDQU64 (R15)(R14*1), Z25 MOVQ 120(R13), R15 VMOVDQU64 (R15)(R14*1), Z26 MOVQ 144(R13), R15 VMOVDQU64 (R15)(R14*1), Z27 MOVQ 168(R13), R15 VMOVDQU64 (R15)(R14*1), Z28 MOVQ 192(R13), R15 VMOVDQU64 (R15)(R14*1), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB 
$0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 
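// VGF2P8AFFINEQB multiplies every byte by a fixed GF(2^8) element:
// multiplication by a constant is linear over GF(2), so the factor
// can be encoded as an 8x8 bit matrix packed into a single 64-bit
// table entry, replacing both nibble lookups of the AVX2 path with
// one instruction. Each matrix for input i, output j sits (i*9+j)*8
// bytes into CX; only 21 of the 81 fit in Z0-Z20, so later inputs
// use the .BCST form to broadcast theirs straight from memory, as in
// the 400(CX) = (5*9+5)*8 operand (input 5, output 5) just below.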
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 9 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R13), R15 VMOVDQU64 Z21, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z22, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x9_64Xor_loop VZEROUPPER mulGFNI_9x9_64Xor_end: RET // func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 // Loading 5 of 81 tables to registers // Destination kept on stack // Full registers estimated 92 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ 
out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x9Xor_loop: // Load 9 outputs MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y5 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y6 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y7 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y8 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y9 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y10 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y11 MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y12 MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 
264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and 
process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x9Xor_loop VZEROUPPER mulAvxGFNI_9x9Xor_end: RET // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 176 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 mulAvxTwo_9x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 
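	// Note: Y12 and Y13 hold the low and high nibbles of the input bytes,
	// and Y10/Y11 are the matching 16-entry lookup tables, so the two
	// VPSHUFB results are the per-nibble partial products that the
	// following XOR3WAY folds into the output accumulator.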
XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( 
$0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 
3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4608(CX), Y10 VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y8, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x9Xor_loop VZEROUPPER mulAvxTwo_9x9Xor_end: RET // func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 
195 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 mulAvxTwo_9x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 
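	// Note: each (input, output) pair owns 64 bytes of the flattened
	// lookup matrix, a 32-byte low-nibble table followed by a 32-byte
	// high-nibble table, which is why the offsets from CX advance in
	// steps of 64 within each input block.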
XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from 
input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, 
Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5120(CX), Y11 VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 216(R13), R15 VMOVDQU Y9, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x10_loop VZEROUPPER mulAvxTwo_9x10_end: RET // func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64(SB), $0-88 // Loading 20 of 90 tables to registers // Destination kept on stack // Full registers estimated 102 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, 
R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 
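	// Note: only the first 20 of the 90 GFNI tables stay resident in
	// Z0-Z19 (inputs 0 and 1); from input 2 onward every coefficient is
	// broadcast straight from memory with the .BCST form, trading a
	// memory operand per multiply for lower register pressure.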
	VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 6 to 10 outputs
	VMOVDQU64 (R11), Z30
	ADDQ $0x40, R11
	VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 7 to 10 outputs
	VMOVDQU64 (R12), Z30
	ADDQ $0x40, R12
	VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Load and process 64 bytes from input 8 to 10 outputs
	VMOVDQU64 (DX), Z30
	ADDQ $0x40, DX
	VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31
	VXORPD Z20, Z31, Z20
	VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31
	VXORPD Z21, Z31, Z21
	VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31
	VXORPD Z22, Z31, Z22
	VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31
	VXORPD Z23, Z31, Z23
	VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31
	VXORPD Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31
	VXORPD Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31
	VXORPD Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31
	VXORPD Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31
	VXORPD Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31
	VXORPD Z29, Z31, Z29

	// Store 10 outputs
	MOVQ (R13), R15
	VMOVDQU64 Z20, (R15)(R14*1)
	MOVQ 24(R13), R15
	VMOVDQU64 Z21, (R15)(R14*1)
	MOVQ 48(R13), R15
	VMOVDQU64 Z22, (R15)(R14*1)
	MOVQ 72(R13), R15
	VMOVDQU64 Z23, (R15)(R14*1)
	MOVQ 96(R13), R15
	VMOVDQU64 Z24, (R15)(R14*1)
	MOVQ 120(R13), R15
	VMOVDQU64 Z25, (R15)(R14*1)
	MOVQ 144(R13), R15
	VMOVDQU64 Z26, (R15)(R14*1)
	MOVQ 168(R13), R15
	VMOVDQU64 Z27, (R15)(R14*1)
	MOVQ 192(R13), R15
	VMOVDQU64 Z28, (R15)(R14*1)
	MOVQ 216(R13), R15
	VMOVDQU64 Z29, (R15)(R14*1)

	// Prepare for next loop
	ADDQ $0x40, R14
	DECQ AX
	JNZ mulGFNI_9x10_64_loop
	VZEROUPPER

mulGFNI_9x10_64_end:
	RET

// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_9x10(SB), $0-88
	// Loading 4 of 90 tables to registers
	// Destination kept on stack
	// Full registers estimated 102 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_9x10_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), DX
	MOVQ out_base+48(FP), R13
	MOVQ out_base+48(FP), R13
	MOVQ start+72(FP), R14

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, DX

mulAvxGFNI_9x10_loop:
	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y14
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y0, Y14, Y4
	VGF2P8AFFINEQB $0x00, Y1, Y14, Y5
	VGF2P8AFFINEQB $0x00, Y2, Y14, Y6
	VGF2P8AFFINEQB $0x00, Y3, Y14, Y7
	VBROADCASTSD 32(CX), Y8
	VGF2P8AFFINEQB $0x00, Y8, Y14, Y8
	VBROADCASTSD 40(CX), Y9
	VGF2P8AFFINEQB $0x00, Y9, Y14, Y9
	VBROADCASTSD 48(CX), Y10
	VGF2P8AFFINEQB $0x00, Y10, Y14, Y10
	VBROADCASTSD 56(CX), Y11
	VGF2P8AFFINEQB $0x00, Y11, Y14, Y11
	VBROADCASTSD 64(CX), Y12
	VGF2P8AFFINEQB $0x00, Y12, Y14, Y12
	VBROADCASTSD 72(CX), Y13
	VGF2P8AFFINEQB $0x00, Y13, Y14, Y13

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y14
	ADDQ $0x20, SI
	VBROADCASTSD 80(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 88(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 96(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 104(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 112(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 120(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 128(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 136(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 144(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 152(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y14
	ADDQ $0x20, DI
	VBROADCASTSD 160(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 168(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 176(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 184(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 192(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 200(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 208(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 216(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 224(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 232(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (R8), Y14
	ADDQ $0x20, R8
	VBROADCASTSD 240(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 248(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 256(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 264(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 272(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 280(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 288(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 296(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 304(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 312(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 4 to 10 outputs
	VMOVDQU (R9), Y14
	ADDQ $0x20, R9
	VBROADCASTSD 320(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 328(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 336(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 344(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 352(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 360(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 368(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 376(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 384(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 392(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 5 to 10 outputs
	VMOVDQU (R10), Y14
	ADDQ $0x20, R10
	VBROADCASTSD 400(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 408(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 416(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 424(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 432(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 440(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 448(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 456(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 464(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 472(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 6 to 10 outputs
	VMOVDQU (R11), Y14
	ADDQ $0x20, R11
	VBROADCASTSD 480(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 488(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
	VBROADCASTSD 496(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y6, Y15, Y6
	VBROADCASTSD 504(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y7, Y15, Y7
	VBROADCASTSD 512(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y8, Y15, Y8
	VBROADCASTSD 520(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y9, Y15, Y9
	VBROADCASTSD 528(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y10, Y15, Y10
	VBROADCASTSD 536(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y11, Y15, Y11
	VBROADCASTSD 544(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y12, Y15, Y12
	VBROADCASTSD 552(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y13, Y15, Y13

	// Load and process 32 bytes from input 7 to 10 outputs
	VMOVDQU (R12), Y14
	ADDQ $0x20, R12
	VBROADCASTSD 560(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y4, Y15, Y4
	VBROADCASTSD 568(CX), Y15
	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
	VXORPD Y5, Y15, Y5
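	// Note: this VEX-encoded AVX variant lacks the embedded-broadcast
	// (.BCST) form of VGF2P8AFFINEQB used by the 512-bit kernels, so apart
	// from the four tables cached in Y0-Y3 each coefficient is
	// re-broadcast into Y15 with VBROADCASTSD immediately before its
	// multiply; 32 bytes are processed per input per iteration.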
VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 648(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 656(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 664(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 672(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 680(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 688(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 696(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 704(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 712(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 216(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x10_loop VZEROUPPER mulAvxGFNI_9x10_end: RET // func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 // Loading 20 of 90 tables to registers // Destination kept on stack // Full registers estimated 102 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_9x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulGFNI_9x10_64Xor_loop: // Load 10 outputs MOVQ (R13), R15 VMOVDQU64 (R15)(R14*1), Z20 MOVQ 
24(R13), R15 VMOVDQU64 (R15)(R14*1), Z21 MOVQ 48(R13), R15 VMOVDQU64 (R15)(R14*1), Z22 MOVQ 72(R13), R15 VMOVDQU64 (R15)(R14*1), Z23 MOVQ 96(R13), R15 VMOVDQU64 (R15)(R14*1), Z24 MOVQ 120(R13), R15 VMOVDQU64 (R15)(R14*1), Z25 MOVQ 144(R13), R15 VMOVDQU64 (R15)(R14*1), Z26 MOVQ 168(R13), R15 VMOVDQU64 (R15)(R14*1), Z27 MOVQ 192(R13), R15 VMOVDQU64 (R15)(R14*1), Z28 MOVQ 216(R13), R15 VMOVDQU64 (R15)(R14*1), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), 
Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 10 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R13), R15 VMOVDQU64 Z20, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU64 Z21, (R15)(R14*1) MOVQ 
48(R13), R15 VMOVDQU64 Z22, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU64 Z23, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU64 Z24, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU64 Z25, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU64 Z26, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU64 Z27, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU64 Z28, (R15)(R14*1) MOVQ 216(R13), R15 VMOVDQU64 Z29, (R15)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ AX JNZ mulGFNI_9x10_64Xor_loop VZEROUPPER mulGFNI_9x10_64Xor_end: RET // func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 // Loading 4 of 90 tables to registers // Destination kept on stack // Full registers estimated 102 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_9x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX mulAvxGFNI_9x10Xor_loop: // Load 10 outputs MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y4 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y5 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y6 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y7 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y8 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y9 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y10 MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y11 MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y12 MOVQ 216(R13), R15 VMOVDQU (R15)(R14*1), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 
10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 648(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 656(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 664(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 672(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 680(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 688(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 696(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 704(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 712(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y9, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y10, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y11, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y12, (R15)(R14*1) MOVQ 216(R13), R15 VMOVDQU Y13, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxGFNI_9x10Xor_loop VZEROUPPER mulAvxGFNI_9x10Xor_end: RET // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), $0-88 // Loading no tables to registers // Destination
kept on stack // Full registers estimated 195 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 mulAvxTwo_9x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R13), R15 VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R13), R15 VMOVDQU (R15)(R14*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 
1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 
2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5120(CX), Y11 VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 216(R13), R15 VMOVDQU Y9, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x10Xor_loop VZEROUPPER mulAvxTwo_9x10Xor_end: RET // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x1_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ (R14), R14 MOVQ 
start+72(FP), R15 // Add start offset to output ADDQ R15, R14 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X2 VPBROADCASTB X2, Y2 mulAvxTwo_10x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y0 VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 VMOVDQU 32(R13), Y5 ADDQ $0x40, R13 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, 
Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 512(CX), Y3
	VMOVDQU 544(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 9 to 1 outputs
	VMOVDQU (DX), Y6
	VMOVDQU 32(DX), Y5
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 576(CX), Y3
	VMOVDQU 608(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Store 1 outputs
	VMOVDQU Y0, (R14)
	VMOVDQU Y1, 32(R14)
	ADDQ $0x40, R14

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_10x1_64_loop
	VZEROUPPER

mulAvxTwo_10x1_64_end:
	RET

// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x1_64(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_10x1_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulGFNI_10x1_64_loop:
	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z11
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z11, Z10

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z11
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z11
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z11
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z11
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z11
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z11
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (R11), Z11
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64 (R12), Z11
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 9 to 1 outputs
	VMOVDQU64 (CX), Z11
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Store 1 outputs
	VMOVDQU64 Z10, (R13)
	ADDQ $0x40, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_10x1_64_loop
	VZEROUPPER

mulGFNI_10x1_64_end:
	RET
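
// Editor's note: each 64-bit word of the matrix argument encodes
// multiplication by one GF(2^8) constant as an 8x8 bit matrix, broadcast to
// all lanes (VBROADCASTF32X2) and applied with VGF2P8AFFINEQB; the per-input
// partial products are folded together with VXORPD. A minimal scalar sketch
// of what mulGFNI_10x1_64 computes, assuming a hypothetical helper
// gfMul(a, b byte) byte that multiplies in the package's GF(2^8) (names are
// illustrative, not part of this file):
//
//	func refMul10x1(coef [10]byte, in [10][]byte, out []byte) {
//		for i := range out {
//			var acc byte
//			for j := range coef {
//				acc ^= gfMul(coef[j], in[j][i]) // accumulate GF(2^8) products
//			}
//			out[i] = acc // plain variant overwrites; an Xor variant would do out[i] ^= acc
//		}
//	}
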
// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x1(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_10x1_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulAvxGFNI_10x1_loop:
	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y11, Y10

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y11
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y11
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y11
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (R11), Y11
	ADDQ $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU (R12), Y11
	ADDQ $0x20, R12
	VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 9 to 1 outputs
	VMOVDQU (CX), Y11
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Store 1 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_10x1_loop
	VZEROUPPER

mulAvxGFNI_10x1_end:
	RET
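
// Editor's note: the *Xor variants that follow perform the same multiply but
// first load the current contents of the output ("Load 1 outputs") and XOR
// the products into it instead of overwriting it, so a caller can accumulate
// one logical multiplication across several calls.
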
// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x1_64Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulGFNI_10x1_64Xor_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulGFNI_10x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU64 (R13), Z10

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU64 (DX), Z11
	ADDQ $0x40, DX
	VGF2P8AFFINEQB $0x00, Z0, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU64 (BX), Z11
	ADDQ $0x40, BX
	VGF2P8AFFINEQB $0x00, Z1, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU64 (SI), Z11
	ADDQ $0x40, SI
	VGF2P8AFFINEQB $0x00, Z2, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU64 (DI), Z11
	ADDQ $0x40, DI
	VGF2P8AFFINEQB $0x00, Z3, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU64 (R8), Z11
	ADDQ $0x40, R8
	VGF2P8AFFINEQB $0x00, Z4, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 5 to 1 outputs
	VMOVDQU64 (R9), Z11
	ADDQ $0x40, R9
	VGF2P8AFFINEQB $0x00, Z5, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 6 to 1 outputs
	VMOVDQU64 (R10), Z11
	ADDQ $0x40, R10
	VGF2P8AFFINEQB $0x00, Z6, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 7 to 1 outputs
	VMOVDQU64 (R11), Z11
	ADDQ $0x40, R11
	VGF2P8AFFINEQB $0x00, Z7, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 8 to 1 outputs
	VMOVDQU64 (R12), Z11
	ADDQ $0x40, R12
	VGF2P8AFFINEQB $0x00, Z8, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Load and process 64 bytes from input 9 to 1 outputs
	VMOVDQU64 (CX), Z11
	ADDQ $0x40, CX
	VGF2P8AFFINEQB $0x00, Z9, Z11, Z11
	VXORPD Z10, Z11, Z10

	// Store 1 outputs
	VMOVDQU64 Z10, (R13)
	ADDQ $0x40, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulGFNI_10x1_64Xor_loop
	VZEROUPPER

mulGFNI_10x1_64Xor_end:
	RET
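
// Editor's note: mulAvxGFNI_10x1Xor below is the 256-bit AVX+GFNI version of
// the kernel above: 32 bytes per iteration in Y registers instead of 64 bytes
// in Z registers, with the same load/multiply/XOR structure. Selection
// between the variants is done by CPU-feature dispatch in the package's Go
// code, not inside this file.
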
// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, GFNI
TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 13 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxGFNI_10x1Xor_end
	VBROADCASTSD (CX), Y0
	VBROADCASTSD 8(CX), Y1
	VBROADCASTSD 16(CX), Y2
	VBROADCASTSD 24(CX), Y3
	VBROADCASTSD 32(CX), Y4
	VBROADCASTSD 40(CX), Y5
	VBROADCASTSD 48(CX), Y6
	VBROADCASTSD 56(CX), Y7
	VBROADCASTSD 64(CX), Y8
	VBROADCASTSD 72(CX), Y9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), R8
	MOVQ 120(CX), R9
	MOVQ 144(CX), R10
	MOVQ 168(CX), R11
	MOVQ 192(CX), R12
	MOVQ 216(CX), CX
	MOVQ out_base+48(FP), R13
	MOVQ out_base+48(FP), R13
	MOVQ (R13), R13
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R13

	// Add start offset to input
	ADDQ R14, DX
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, R9
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, CX

mulAvxGFNI_10x1Xor_loop:
	// Load 1 outputs
	VMOVDQU (R13), Y10

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VGF2P8AFFINEQB $0x00, Y0, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VGF2P8AFFINEQB $0x00, Y1, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VGF2P8AFFINEQB $0x00, Y2, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VGF2P8AFFINEQB $0x00, Y3, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (R8), Y11
	ADDQ $0x20, R8
	VGF2P8AFFINEQB $0x00, Y4, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 5 to 1 outputs
	VMOVDQU (R9), Y11
	ADDQ $0x20, R9
	VGF2P8AFFINEQB $0x00, Y5, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 6 to 1 outputs
	VMOVDQU (R10), Y11
	ADDQ $0x20, R10
	VGF2P8AFFINEQB $0x00, Y6, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 7 to 1 outputs
	VMOVDQU (R11), Y11
	ADDQ $0x20, R11
	VGF2P8AFFINEQB $0x00, Y7, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 8 to 1 outputs
	VMOVDQU (R12), Y11
	ADDQ $0x20, R12
	VGF2P8AFFINEQB $0x00, Y8, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Load and process 32 bytes from input 9 to 1 outputs
	VMOVDQU (CX), Y11
	ADDQ $0x20, CX
	VGF2P8AFFINEQB $0x00, Y9, Y11, Y11
	VXORPD Y10, Y11, Y10

	// Store 1 outputs
	VMOVDQU Y10, (R13)
	ADDQ $0x20, R13

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxGFNI_10x1Xor_loop
	VZEROUPPER

mulAvxGFNI_10x1Xor_end:
	RET

// func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 46 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_10x1_64Xor_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), R9
	MOVQ 120(DX), R10
	MOVQ 144(DX), R11
	MOVQ 168(DX), R12
	MOVQ 192(DX), R13
	MOVQ 216(DX), DX
	MOVQ out_base+48(FP), R14
	MOVQ out_base+48(FP), R14
	MOVQ (R14), R14
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R14

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX
	MOVQ $0x0000000f, R15
	MOVQ R15, X2
	VPBROADCASTB X2, Y2

mulAvxTwo_10x1_64Xor_loop:
	// Load 1 outputs
	VMOVDQU (R14), Y0
	VMOVDQU 32(R14), Y1

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU (BX), Y6
	VMOVDQU 32(BX), Y5
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU (CX), Y3
	VMOVDQU 32(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU (SI), Y6
	VMOVDQU 32(SI), Y5
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 64(CX), Y3
	VMOVDQU 96(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU (DI), Y6
	VMOVDQU 32(DI), Y5
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 128(CX), Y3
	VMOVDQU 160(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	XOR3WAY( $0x00, Y3, Y4, Y0)
	XOR3WAY( $0x00, Y5, Y6, Y1)

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU (R8), Y6
	VMOVDQU 32(R8), Y5
	ADDQ $0x40, R8
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 192(CX), Y3
	VMOVDQU 224(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB
Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 VMOVDQU 32(R13), Y5 ADDQ $0x40, R13 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 XOR3WAY( $0x00, Y3, Y4, Y0) XOR3WAY( $0x00, Y5, Y6, Y1) // Store 1 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ $0x40, R14 // Prepare for next loop DECQ AX JNZ mulAvxTwo_10x1_64Xor_loop VZEROUPPER mulAvxTwo_10x1_64Xor_end: RET // func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x2_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 ADDQ BP, R14 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB 
X4, Y4 mulAvxTwo_10x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y0 VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y2 VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 
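
	// Editor's note: the mulAvxTwo_* kernels implement GF(2^8) multiplication
	// with the classic two-nibble table scheme: VPSRLQ/VPAND split each byte
	// into 4-bit halves against the 0x0f mask (Y4 here), VPSHUFB uses each
	// half as an index into a 16-entry table, and XOR3WAY folds both partial
	// products into the accumulators. A scalar sketch, where lo and hi stand
	// for the two hypothetical 16-byte tables stored per coefficient at (CX):
	//
	//	func refMulTable(lo, hi [16]byte, in, out []byte) {
	//		for i, b := range in {
	//			out[i] ^= lo[b&0x0f] ^ hi[b>>4] // product of b with the coefficient
	//		}
	//	}
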
VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 VMOVDQU 32(R13), Y11 ADDQ $0x40, R13 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R15) VMOVDQU Y1, 32(R15) ADDQ $0x40, R15 VMOVDQU Y2, (R14) VMOVDQU Y3, 32(R14) ADDQ $0x40, R14 // Prepare for next loop DECQ AX JNZ mulAvxTwo_10x2_64_loop VZEROUPPER mulAvxTwo_10x2_64_end: RET // func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x2_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), R12 MOVQ 216(CX), CX MOVQ 
out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, DX ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, CX mulGFNI_10x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z22 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z22, Z20 VGF2P8AFFINEQB $0x00, Z1, Z22, Z21 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z22 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z22 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z22 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z22 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (R9), Z22 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU64 (R10), Z22 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU64 (R11), Z22 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU64 (R12), Z22 ADDQ $0x40, R12 VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU64 (CX), Z22 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 VXORPD Z21, Z23, Z21 // Store 2 outputs VMOVDQU64 Z20, (R14) ADDQ $0x40, R14 VMOVDQU64 Z21, (R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulGFNI_10x2_64_loop VZEROUPPER mulGFNI_10x2_64_end: RET // func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x2(SB), $8-88 // Loading 12 of 20 tables to registers // Destination kept in GP registers // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x2_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 ADDQ BP, R14 // Add start offset to input 
ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, DX mulAvxGFNI_10x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R14) ADDQ $0x20, R14 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_10x2_loop VZEROUPPER mulAvxGFNI_10x2_end: RET // func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x2_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), DI MOVQ 96(CX), R8 MOVQ 
120(CX), R9 MOVQ 144(CX), R10 MOVQ 168(CX), R11 MOVQ 192(CX), R12 MOVQ 216(CX), CX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, DX ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, CX mulGFNI_10x2_64Xor_loop: // Load 2 outputs VMOVDQU64 (R14), Z20 VMOVDQU64 (R13), Z21 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z22 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z1, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z22 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU64 (SI), Z22 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU64 (DI), Z22 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU64 (R8), Z22 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU64 (R9), Z22 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU64 (R10), Z22 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU64 (R11), Z22 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU64 (R12), Z22 ADDQ $0x40, R12 VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 VXORPD Z21, Z23, Z21 // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU64 (CX), Z22 ADDQ $0x40, CX VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 VXORPD Z20, Z23, Z20 VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 VXORPD Z21, Z23, Z21 // Store 2 outputs VMOVDQU64 Z20, (R14) ADDQ $0x40, R14 VMOVDQU64 Z21, (R13) ADDQ $0x40, R13 // Prepare for next loop DECQ AX JNZ mulGFNI_10x2_64Xor_loop VZEROUPPER mulGFNI_10x2_64Xor_end: RET // func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 // Loading 12 of 20 tables to registers // Destination kept in GP registers // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x2Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 VBROADCASTSD 88(CX), Y11 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 
216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 ADDQ BP, R14 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, DX mulAvxGFNI_10x2Xor_loop: // Load 2 outputs VMOVDQU (R15), Y12 VMOVDQU (R14), Y13 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 2 outputs VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R14) ADDQ $0x20, R14 // Prepare for next loop DECQ AX JNZ mulAvxGFNI_10x2Xor_loop VZEROUPPER mulAvxGFNI_10x2Xor_end: RET // func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 
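// Unlike the non-Xor variant, mulAvxTwo_10x2_64Xor first loads the current
// contents of both output shards each iteration and accumulates into them,
// so existing parity is updated rather than overwritten.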
ADDQ BP, R14 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_10x2_64Xor_loop: // Load 2 outputs VMOVDQU (R15), Y0 VMOVDQU 32(R15), Y1 VMOVDQU (R14), Y2 VMOVDQU 32(R14), Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, 
Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 VMOVDQU 32(R13), Y11 ADDQ $0x40, R13 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) XOR3WAY( $0x00, Y7, Y8, Y3) // Store 2 outputs VMOVDQU Y0, (R15) VMOVDQU Y1, 32(R15) ADDQ $0x40, R15 VMOVDQU Y2, (R14) VMOVDQU Y3, 32(R14) ADDQ $0x40, R14 // Prepare for next loop DECQ AX JNZ mulAvxTwo_10x2_64Xor_loop VZEROUPPER mulAvxTwo_10x2_64Xor_end: RET // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 130 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 
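// With 3 outputs, all destination pointers (R14, R15, R13) still fit in
// general-purpose registers for the whole loop; per the comments in this
// file, wider kernels such as mulAvxTwo_10x4 below keep the destination
// list on the stack instead.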
// Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulAvxTwo_10x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y0 VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y2 VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y4 VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 
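// Each (input, output) pair owns 64 bytes of lookup tables: the low-nibble
// table at +0 and the high-nibble table at +32, at offset
// (input*3+output)*64 for this 10x3 kernel; 768(CX) above is input 4,
// output 0.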
VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) 
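// Y0/Y1, Y2/Y3 and Y4/Y5 hold the low and high 32-byte halves of the three
// 64-byte output blocks; every input folds into all six accumulators
// before the results are stored once per iteration.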
XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ $0x40, R14 VMOVDQU Y2, (R15) VMOVDQU Y3, 32(R15) ADDQ $0x40, R15 VMOVDQU Y4, (R13) VMOVDQU Y5, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ BP JNZ mulAvxTwo_10x3_64_loop VZEROUPPER mulAvxTwo_10x3_64_end: RET // func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64(SB), $8-88 // Loading 27 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x3_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 VBROADCASTF32X2 208(CX), Z26 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_10x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load 
and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 3 outputs VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R13) ADDQ $0x40, R13 // Prepare for next loop DECQ BP JNZ mulGFNI_10x3_64_loop VZEROUPPER mulGFNI_10x3_64_end: RET // func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x3(SB), $8-88 // Loading 11 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x3_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_10x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 // Load and process 32 bytes from input 1 
to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_10x3_loop VZEROUPPER mulAvxGFNI_10x3_end: RET // func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 // Loading 27 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x3_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 
48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 VBROADCASTF32X2 208(CX), Z26 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulGFNI_10x3_64Xor_loop: // Load 3 outputs VMOVDQU64 (R14), Z27 VMOVDQU64 (R15), Z28 VMOVDQU64 (R13), Z29 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z25, 
Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU64 (AX), Z30 ADDQ $0x40, AX VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 3 outputs VMOVDQU64 Z27, (R14) ADDQ $0x40, R14 VMOVDQU64 Z28, (R15) ADDQ $0x40, R15 VMOVDQU64 Z29, (R13) ADDQ $0x40, R13 // Prepare for next loop DECQ BP JNZ mulGFNI_10x3_64Xor_loop VZEROUPPER mulGFNI_10x3_64Xor_end: RET // func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 // Loading 11 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x3Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 VBROADCASTSD 80(CX), Y10 MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxGFNI_10x3Xor_loop: // Load 3 outputs VMOVDQU (R14), Y11 VMOVDQU (R15), Y12 VMOVDQU (R13), Y13 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y14 ADDQ $0x20, AX VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 3 outputs VMOVDQU Y11, (R14) ADDQ $0x20, R14 VMOVDQU Y12, (R15) ADDQ $0x20, R15 VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ BP JNZ mulAvxGFNI_10x3Xor_loop VZEROUPPER mulAvxGFNI_10x3Xor_end: RET // func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 130 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x3_64Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 // Reload length to save a register MOVQ n+80(FP), BP SHRQ $0x06, BP mulAvxTwo_10x3_64Xor_loop: // Load 3 outputs VMOVDQU (R14), Y0 VMOVDQU 32(R14), Y1 VMOVDQU (R15), Y2 VMOVDQU 32(R15), Y3 VMOVDQU (R13), Y4 VMOVDQU 32(R13), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 
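// GF(2^8) multiply by 4-bit split: VPSRLQ/VPAND above separate each source
// byte into low and high nibbles, each nibble selects from a 16-entry table
// via VPSHUFB, and XORing the two partial products yields the full product.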
VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, 
Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) XOR3WAY( $0x00, Y9, Y10, Y5) // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) ADDQ $0x40, R14 VMOVDQU Y2, (R15) 
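// The stores target the same addresses the accumulators were loaded from
// at the top of the loop, so this Xor variant adds the new products into
// the existing output blocks before the pointers advance.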
VMOVDQU Y3, 32(R15) ADDQ $0x40, R15 VMOVDQU Y4, (R13) VMOVDQU Y5, 32(R13) ADDQ $0x40, R13 // Prepare for next loop DECQ BP JNZ mulAvxTwo_10x3_64Xor_loop VZEROUPPER mulAvxTwo_10x3_64Xor_end: RET // func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_10x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB 
Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 ADDQ $0x20, R13 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2048(CX), Y5 VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2304(CX), Y5 VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x4_loop VZEROUPPER mulAvxTwo_10x4_end: RET // func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, 
AVX512F, GFNI TEXT ·mulGFNI_10x4_64(SB), $8-88 // Loading 26 of 40 tables to registers // Destination kept on stack // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x4_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x4_64_loop: // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 4 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), 
Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 4 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 4 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 4 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 4 outputs MOVQ (R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x4_64_loop VZEROUPPER mulGFNI_10x4_64_end: RET // func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x4(SB), $8-88 // Loading 10 of 40 tables to registers // Destination kept on stack // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x4_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x4_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs MOVQ (R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x4_loop VZEROUPPER mulAvxGFNI_10x4_end: RET // func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 // Loading 26 of 40 tables to registers // Destination kept on stack // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x4_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 
40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 VBROADCASTF32X2 200(CX), Z25 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x4_64Xor_loop: // Load 4 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z26 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 4 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 4 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 4 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 4 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 4 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 4 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 4 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 4 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, 
R12 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 4 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 4 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 4 outputs MOVQ (R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x4_64Xor_loop VZEROUPPER mulGFNI_10x4_64Xor_end: RET // func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 // Loading 10 of 40 tables to registers // Destination kept on stack // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x4Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 VBROADCASTSD 72(CX), Y9 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x4Xor_loop: // Load 4 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 4 
outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 4 outputs MOVQ (R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x4Xor_loop VZEROUPPER mulAvxGFNI_10x4Xor_end: RET // func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX 
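// AX now holds n/32, the number of whole 32-byte blocks per slice; the branch below exits early when there is no work.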
JZ mulAvxTwo_10x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_10x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 5 to 4 outputs 
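// Same pattern as every input above: split each byte into low/high nibbles (VPAND/VPSRLQ), translate both nibbles through this input/output pair's two 32-byte lookup tables with VPSHUFB, and fold the pair of results into the accumulator with XOR3WAY.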
VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 ADDQ $0x20, R13 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2048(CX), Y5 VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2304(CX), Y5 VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y3) // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x4Xor_loop VZEROUPPER mulAvxTwo_10x4Xor_end: RET // func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 
48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 mulAvxTwo_10x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 
1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2560(CX), Y6 VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2880(CX), Y6 VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, 
Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x5_loop VZEROUPPER mulAvxTwo_10x5_end: RET // func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64(SB), $8-88 // Loading 25 of 50 tables to registers // Destination kept on stack // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x5_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x5_64_loop: // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 
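// Each VGF2P8AFFINEQB applies one broadcast 8x8 bit-matrix (a GF(2^8) multiply by a single matrix coefficient) to all 64 input bytes; the immediate is 0, so no constant is XORed in. Since GFNI has no multiply-accumulate form, each product is folded into the running output with a separate VXORPD.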
VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 5 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 5 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 5 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs MOVQ (R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x5_64_loop VZEROUPPER mulGFNI_10x5_64_end: RET // func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x5(SB), $8-88 // Loading 9 of 50 tables to registers // Destination kept on stack // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x5_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, 
DX mulAvxGFNI_10x5_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs MOVQ (R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x5_loop VZEROUPPER mulAvxGFNI_10x5_end: RET // func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 // Loading 25 of 50 tables to registers // Destination kept on stack // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x5_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 VBROADCASTF32X2 192(CX), Z24 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x5_64Xor_loop: // Load 5 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z25 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z26 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 96(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 5 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z28, Z31, 
Z28 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 5 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 5 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 5 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 5 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 5 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 5 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 5 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 5 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 5 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST 
$0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 5 outputs MOVQ (R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x5_64Xor_loop VZEROUPPER mulGFNI_10x5_64Xor_end: RET // func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 // Loading 9 of 50 tables to registers // Destination kept on stack // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x5Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 VBROADCASTSD 64(CX), Y8 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x5Xor_loop: // Load 5 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y9 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y14 ADDQ $0x20, 
R9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 5 outputs MOVQ (R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x5Xor_loop VZEROUPPER mulAvxGFNI_10x5Xor_end: RET // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 110 YMM used 
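// Xor variant of mulAvxTwo_10x5: each accumulator is seeded by loading the current destination block (the MOVQ/VMOVDQU pairs at the top of the loop), so products are XORed into the existing output instead of replacing it. No lookup tables are kept in registers; all 100 32-byte tables stream from (CX).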
MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 mulAvxTwo_10x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, 
R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2560(CX), Y6 VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2880(CX), Y6 VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, 
Y7 XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y4) // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x5Xor_loop VZEROUPPER mulAvxTwo_10x5Xor_end: RET // func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 131 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_10x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 
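// The block above is the standard split-nibble GF(2^8) multiply used by
// all mulAvxTwo kernels: every coefficient of the 10x6 matrix expands
// into two 32-byte lookup tables, one holding its products with the 16
// low-nibble values and one with the 16 high-nibble values. Per byte
// this computes, roughly (an illustrative sketch, not generated code):
//
//	out[j] ^= lowTbl[in&0x0f] ^ highTbl[in>>4]
//
// Y6 is the broadcast 0x0f mask, Y9/Y10 hold the low/high nibbles of
// the 32 input bytes, Y7/Y8 are the table pair just loaded, and
// XOR3WAY folds both VPSHUFB lookups into the output accumulator.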
VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB 
Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3072(CX), Y7 VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3456(CX), Y7 VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x6_loop VZEROUPPER mulAvxTwo_10x6_end: RET // func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64(SB), $8-88 // Loading 24 of 60 tables to registers // Destination kept on stack // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x6_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), 
Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x6_64_loop: // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST 
$0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 6 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 6 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs MOVQ (R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x6_64_loop VZEROUPPER mulGFNI_10x6_64_end: RET // func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x6(SB), $8-88 // Loading 8 of 60 tables to registers // Destination kept on stack // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x6_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x6_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, 
Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 
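// Register budget note: per the function header, only 8 of this
// kernel's 60 coefficient matrices stay resident (Y0-Y7). Every later
// step, like those surrounding this point, re-broadcasts its 8-byte
// matrix from (CX) into Y15 and then overwrites Y15 with the affine
// product, so a single scratch register doubles as table and result.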
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs MOVQ (R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x6_loop VZEROUPPER mulAvxGFNI_10x6_end: RET // func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 // Loading 24 of 60 tables to registers // Destination kept on stack // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x6_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 VBROADCASTF32X2 184(CX), Z23 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x6_64Xor_loop: // Load 6 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z24 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z25 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z26 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 96(R14), 
BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 120(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 6 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 6 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 6 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 6 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 6 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 6 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 
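// Each accumulate step here is one GF(2^8) multiply expressed as an
// 8x8 bit-matrix transform: VGF2P8AFFINEQB applies the broadcast
// matrix to every byte of Z30, and VXORPD folds the product into an
// output that this Xor variant pre-loaded at the top of the loop
// ("Load 6 outputs") instead of initializing from scratch. Per byte,
// roughly (sketch only; gf2p8affine names the per-byte transform the
// instruction implements):
//
//	out[j] ^= gf2p8affine(matrix[k], in)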
VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 6 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 6 outputs MOVQ (R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x6_64Xor_loop VZEROUPPER mulGFNI_10x6_64Xor_end: RET // func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 // Loading 8 of 60 tables to registers // Destination kept on stack // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x6Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 VBROADCASTSD 56(CX), Y7 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x6Xor_loop: // Load 6 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y8 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y9 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 72(CX), 
Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, 
Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 6 outputs MOVQ (R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x6Xor_loop VZEROUPPER mulAvxGFNI_10x6Xor_end: RET // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 131 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_10x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, 
Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB 
Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3072(CX), Y7 VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3456(CX), Y7 VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( 
$0x00, Y7, Y8, Y4) VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y5) // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x6Xor_loop VZEROUPPER mulAvxTwo_10x6Xor_end: RET // func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 152 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 mulAvxTwo_10x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, 
Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( 
$0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3584(CX), Y8 VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 4032(CX), Y8 VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) 
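// With ten input pointers pinned in GP registers there is no room to
// also pin seven output pointers, so this kernel keeps its
// destinations on the stack: each store re-fetches the output slice
// pointer from the headers at (R14) into BP. The non-zero $8 frame
// appears to exist so the assembler preserves BP around this scratch
// use.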
	MOVQ    120(R14), BP
	VMOVDQU Y5, (BP)(R15*1)
	MOVQ    144(R14), BP
	VMOVDQU Y6, (BP)(R15*1)

	// Prepare for next loop
	ADDQ $0x20, R15
	DECQ AX
	JNZ  mulAvxTwo_10x7_loop
	VZEROUPPER

mulAvxTwo_10x7_end:
	RET

// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·mulGFNI_10x7_64(SB), $8-88
	// Loading 23 of 70 tables to registers
	// Destination kept on stack
	// Full registers estimated 79 YMM used
	MOVQ            n+80(FP), AX
	MOVQ            matrix_base+0(FP), CX
	SHRQ            $0x06, AX
	TESTQ           AX, AX
	JZ              mulGFNI_10x7_64_end
	VBROADCASTF32X2 (CX), Z0
	VBROADCASTF32X2 8(CX), Z1
	VBROADCASTF32X2 16(CX), Z2
	VBROADCASTF32X2 24(CX), Z3
	VBROADCASTF32X2 32(CX), Z4
	VBROADCASTF32X2 40(CX), Z5
	VBROADCASTF32X2 48(CX), Z6
	VBROADCASTF32X2 56(CX), Z7
	VBROADCASTF32X2 64(CX), Z8
	VBROADCASTF32X2 72(CX), Z9
	VBROADCASTF32X2 80(CX), Z10
	VBROADCASTF32X2 88(CX), Z11
	VBROADCASTF32X2 96(CX), Z12
	VBROADCASTF32X2 104(CX), Z13
	VBROADCASTF32X2 112(CX), Z14
	VBROADCASTF32X2 120(CX), Z15
	VBROADCASTF32X2 128(CX), Z16
	VBROADCASTF32X2 136(CX), Z17
	VBROADCASTF32X2 144(CX), Z18
	VBROADCASTF32X2 152(CX), Z19
	VBROADCASTF32X2 160(CX), Z20
	VBROADCASTF32X2 168(CX), Z21
	VBROADCASTF32X2 176(CX), Z22
	MOVQ            in_base+24(FP), DX
	MOVQ            (DX), BX
	MOVQ            24(DX), SI
	MOVQ            48(DX), DI
	MOVQ            72(DX), R8
	MOVQ            96(DX), R9
	MOVQ            120(DX), R10
	MOVQ            144(DX), R11
	MOVQ            168(DX), R12
	MOVQ            192(DX), R13
	MOVQ            216(DX), DX
	MOVQ            out_base+48(FP), R14
	MOVQ            out_base+48(FP), R14
	MOVQ            start+72(FP), R15

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, R9
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, DX

mulGFNI_10x7_64_loop:
	// Load and process 64 bytes from input 0 to 7 outputs
	VMOVDQU64      (BX), Z30
	ADDQ           $0x40, BX
	VGF2P8AFFINEQB $0x00, Z0, Z30, Z23
	VGF2P8AFFINEQB $0x00, Z1, Z30, Z24
	VGF2P8AFFINEQB $0x00, Z2, Z30, Z25
	VGF2P8AFFINEQB $0x00, Z3, Z30, Z26
	VGF2P8AFFINEQB $0x00, Z4, Z30, Z27
	VGF2P8AFFINEQB $0x00, Z5, Z30, Z28
	VGF2P8AFFINEQB $0x00, Z6, Z30, Z29

	// Load and process 64 bytes from input 1 to 7 outputs
	VMOVDQU64      (SI), Z30
	ADDQ           $0x40, SI
	VGF2P8AFFINEQB $0x00, Z7, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z8, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z9, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z10, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z11, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z12, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z13, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 2 to 7 outputs
	VMOVDQU64      (DI), Z30
	ADDQ           $0x40, DI
	VGF2P8AFFINEQB $0x00, Z14, Z30, Z31
	VXORPD         Z23, Z31, Z23
	VGF2P8AFFINEQB $0x00, Z15, Z30, Z31
	VXORPD         Z24, Z31, Z24
	VGF2P8AFFINEQB $0x00, Z16, Z30, Z31
	VXORPD         Z25, Z31, Z25
	VGF2P8AFFINEQB $0x00, Z17, Z30, Z31
	VXORPD         Z26, Z31, Z26
	VGF2P8AFFINEQB $0x00, Z18, Z30, Z31
	VXORPD         Z27, Z31, Z27
	VGF2P8AFFINEQB $0x00, Z19, Z30, Z31
	VXORPD         Z28, Z31, Z28
	VGF2P8AFFINEQB $0x00, Z20, Z30, Z31
	VXORPD         Z29, Z31, Z29

	// Load and process 64 bytes from input 3 to 7 outputs
	VMOVDQU64           (R8), Z30
	ADDQ                $0x40, R8
	VGF2P8AFFINEQB      $0x00, Z21, Z30, Z31
	VXORPD              Z23, Z31, Z23
	VGF2P8AFFINEQB      $0x00, Z22, Z30, Z31
	VXORPD              Z24, Z31, Z24
	VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31
	VXORPD              Z25, Z31, Z25
	VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31
	VXORPD              Z26, Z31, Z26
	VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31
	VXORPD              Z27, Z31, Z27
	VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31
	VXORPD              Z28, Z31, Z28
	VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31
	VXORPD              Z29, Z31, Z29

	// Load and process 64 bytes from input 4 to 7 outputs
VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 7 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 7 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 
144(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x7_64_loop VZEROUPPER mulGFNI_10x7_64_end: RET // func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x7(SB), $8-88 // Loading 7 of 70 tables to registers // Destination kept on stack // Full registers estimated 79 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x7_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x7_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 
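	// Only 7 of the 70 table qwords stay resident in Y0-Y6 (they cover
	// input 0); the remaining coefficients, like the one just broadcast
	// above, are re-broadcast from the matrix base (CX) on every pass
	// through the loop.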
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x7_loop VZEROUPPER mulAvxGFNI_10x7_end: RET // func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 // Loading 23 of 70 tables to registers // Destination kept on stack // Full registers estimated 79 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x7_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 VBROADCASTF32X2 176(CX), Z22 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x7_64Xor_loop: // Load 7 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z23 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z24 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z25 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z26 MOVQ 96(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 120(R14), BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 144(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 7 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 7 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 7 outputs VMOVDQU64 
(DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 7 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 7 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 7 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 7 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 7 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 7 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 
480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 7 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 7 outputs MOVQ (R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x7_64Xor_loop VZEROUPPER mulGFNI_10x7_64Xor_end: RET // func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 // Loading 7 of 70 tables to registers // Destination kept on stack // Full registers estimated 79 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x7Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 VBROADCASTSD 48(CX), Y6 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x7Xor_loop: // Load 7 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y7 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y8 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y9 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y11, Y15, Y11 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y12, Y15, Y12 VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 
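	// Each qword broadcast from the matrix is an 8x8 bit-matrix over
	// GF(2) encoding multiplication by one fixed GF(2^8) coefficient;
	// VGF2P8AFFINEQB applies it to all 32 bytes at once and the VXORPD
	// that follows folds the product into the accumulator. A rough
	// scalar model of one such pair (names illustrative, not from the
	// package API):
	//
	//	out[j][i] ^= gfMul(coeff[j][k], in[k][i])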
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 7 outputs MOVQ (R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x7Xor_loop VZEROUPPER mulAvxGFNI_10x7Xor_end: RET // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 152 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 mulAvxTwo_10x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R14), BP VMOVDQU 
(BP)(R15*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 
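	// The 32 bytes just loaded are split below into low and high nibbles
	// (VPSRLQ/VPAND against the 0x0f mask in Y7); each nibble selects
	// from a 16-entry VPSHUFB table and XOR3WAY folds both halves into
	// the running output. A rough scalar model, with illustrative table
	// names:
	//
	//	out[i] ^= lowTbl[in[i]&0x0f] ^ highTbl[in[i]>>4]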
VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB 
Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3584(CX), Y8 VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 4032(CX), Y8 VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y6) // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x7Xor_loop VZEROUPPER mulAvxTwo_10x7Xor_end: RET // func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 173 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 mulAvxTwo_10x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 
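	// Input 0 initializes the eight accumulators: each table-lookup pair
	// is combined with a plain VPXOR writing Y0-Y7 directly; the later
	// inputs switch to XOR3WAY to accumulate on top of those values.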
VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB 
Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, 
Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4096(CX), Y9 VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4608(CX), Y9 VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x8_loop VZEROUPPER mulAvxTwo_10x8_end: RET // func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, 
GFNI TEXT ·mulGFNI_10x8_64(SB), $8-88 // Loading 22 of 80 tables to registers // Destination kept on stack // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x8_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x8_64_loop: // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, 
Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 8 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 8 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, 
Z31, Z29 // Store 8 outputs MOVQ (R14), BP VMOVDQU64 Z22, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x8_64_loop VZEROUPPER mulGFNI_10x8_64_end: RET // func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x8(SB), $8-88 // Loading 6 of 80 tables to registers // Destination kept on stack // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x8_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x8_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 VBROADCASTSD 48(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 56(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 8 outputs 
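// A sketch of what each block above and below computes: VGF2P8AFFINEQB applies an
// 8x8 GF(2) bit matrix (one 64-bit table entry, broadcast to every qword) to each
// byte of the 32-byte input. Multiplying by a constant c in GF(2^8) is linear over
// GF(2), so each matrix encodes mul-by-c for the package's field polynomial; per
// byte this amounts to out[o][j] ^= gfMul(matrix[i*8+o], in[i][j]), where gfMul is
// a hypothetical scalar helper named here only for illustration.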
VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x8_loop VZEROUPPER mulAvxGFNI_10x8_end: RET // func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 // Loading 22 of 80 tables to registers // Destination kept on stack // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x8_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 VBROADCASTF32X2 168(CX), Z21 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x8_64Xor_loop: // Load 8 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z22 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z23 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z24 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z25 MOVQ 96(R14), BP 
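// out is a [][]byte: R14 points at its slice headers (24 bytes each on amd64), so
// (R14), 24(R14), 48(R14), ... give the base pointers of out[0], out[1], ..., and
// R15 carries the running byte offset. The Xor variant loads the existing output
// bytes first so the new products accumulate into them instead of overwriting.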
VMOVDQU64 (BP)(R15*1), Z26 MOVQ 120(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 144(R14), BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 168(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 8 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 8 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 8 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 8 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 8 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 8 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST 
$0x00, 368(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 8 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 8 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 8 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 8 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 8 outputs MOVQ (R14), BP VMOVDQU64 Z22, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x8_64Xor_loop VZEROUPPER mulGFNI_10x8_64Xor_end: RET // func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 // Loading 6 of 80 tables to registers // Destination kept on stack // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x8Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 
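// Only 6 of the 80 coefficient matrices fit in YMM registers next to the working
// set; the rest are re-broadcast from memory with VBROADCASTSD on every loop
// iteration below.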
VBROADCASTSD 40(CX), Y5 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x8Xor_loop: // Load 8 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y6 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y7 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y8 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y9 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y9, Y15, Y9 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y10, Y15, Y10 VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 
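// The coefficient for input i and output o sits at byte offset (i*8+o)*8 in the
// matrix slice, which is why each 8-output block steps through eight consecutive
// 8-byte entries (input 3 uses offsets 192 through 248 here).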
VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, 
Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 8 outputs MOVQ (R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x8Xor_loop VZEROUPPER mulAvxGFNI_10x8Xor_end: RET // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 173 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 mulAvxTwo_10x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 
168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 
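// Classic 4-bit lookup technique: Y11 and Y12 hold the low and high nibbles of the
// input bytes, VPSHUFB uses them as per-lane indices into 16-entry lookup tables,
// and XOR3WAY folds both half-products into the accumulator.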
XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, 
Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4096(CX), Y9 VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4608(CX), Y9 VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y7) // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x8Xor_loop VZEROUPPER mulAvxTwo_10x8Xor_end: RET // func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 194 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 
144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 mulAvxTwo_10x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, 
Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 
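// AVX2 table layout: each (input, output) pair owns 64 bytes of lookup tables, a
// 32-byte low-nibble half and a 32-byte high-nibble half, so pair (i, o) in this
// 10x9 kernel reads offsets (i*9+o)*64 and (i*9+o)*64+32.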
XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4608(CX), Y10 VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) 
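// Unlike the Xor variant, this kernel builds each accumulator from input 0 with a
// plain VPXOR of the two half-products, so the destinations are overwritten rather
// than accumulated into.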
VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 5184(CX), Y10 VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x9_loop VZEROUPPER mulAvxTwo_10x9_end: RET // func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64(SB), $8-88 // Loading 21 of 90 tables to registers // Destination kept on stack // Full registers estimated 101 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x9_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ 
R15, DX mulGFNI_10x9_64_loop: // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 
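// Matrices beyond the 21 cached in Z0-Z20 are applied with VGF2P8AFFINEQB.BCST,
// which reads the 8-byte entry straight from memory with an embedded broadcast
// instead of occupying another register.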
VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 9 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 9 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 outputs MOVQ (R14), BP VMOVDQU64 Z21, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z22, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x9_64_loop VZEROUPPER mulGFNI_10x9_64_end: RET // func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) 
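// mulAvxGFNI_10x9 is the 256-bit GFNI variant of the kernel above: same matrix
// layout, but 32 bytes per output per iteration (n, a byte count, is shifted right
// by 5 instead of 6). It is presumably selected by the package's CPU feature
// detection when GFNI is available without usable AVX512, and any tail shorter
// than a full block is assumed to be handled by the Go caller.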
// Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x9(SB), $8-88 // Loading 5 of 90 tables to registers // Destination kept on stack // Full registers estimated 101 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x9_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x9_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 VBROADCASTSD 40(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 48(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 56(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 64(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 
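// Coefficient layout in the AVX/GFNI path: the qword for (input i,
// output j) sits at byte offset 8*(i*9+j)(CX), so input 3 starts at
// 216(CX) and input 9 ends at 712(CX). VBROADCASTSD replicates it to all
// four qword lanes before the affine multiply.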
VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 
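// With only 5 of the 90 table qwords resident (Y0-Y4, the first five
// coefficients of input 0), every remaining coefficient is re-broadcast
// before use, and partial products are folded in with VXORPD: addition in
// GF(2^8) is carry-less, i.e. plain XOR.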
// Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 648(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 656(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 664(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 672(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 680(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 688(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 696(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 704(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 712(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x9_loop VZEROUPPER mulAvxGFNI_10x9_end: RET // func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 // Loading 21 of 90 tables to registers // Destination kept on stack // Full registers estimated 101 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x9_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 VBROADCASTF32X2 160(CX), Z20 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX 
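// The Xor variant below differs from mulGFNI_10x9_64 only in its loop
// prologue: it first loads the current contents of all 9 destination
// slices and XOR-accumulates every partial product into them, instead of
// overwriting the outputs with the input-0 products. A scalar Go sketch of
// the shared contract (hypothetical reference only; mulConst stands in for
// one GF2P8AFFINEQB application with the coefficient matrix[i*9+j]):
//
//	for j := 0; j < 9; j++ {      // outputs
//		for i := 0; i < 10; i++ { // inputs
//			c := matrix[i*9+j]
//			for n := range out[j] {
//				if i == 0 && !xorMode {
//					out[j][n] = mulConst(c, in[i][n]) // plain kernel: overwrite
//				} else {
//					out[j][n] ^= mulConst(c, in[i][n]) // Xor kernel: accumulate
//				}
//			}
//		}
//	}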
mulGFNI_10x9_64Xor_loop: // Load 9 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z21 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z22 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z23 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z24 MOVQ 96(R14), BP VMOVDQU64 (BP)(R15*1), Z25 MOVQ 120(R14), BP VMOVDQU64 (BP)(R15*1), Z26 MOVQ 144(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 168(R14), BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 192(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 9 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 9 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 9 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 9 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 9 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST 
$0x00, 344(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 9 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 9 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 9 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 9 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 9 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 9 
outputs MOVQ (R14), BP VMOVDQU64 Z21, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z22, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x9_64Xor_loop VZEROUPPER mulGFNI_10x9_64Xor_end: RET // func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 // Loading 5 of 90 tables to registers // Destination kept on stack // Full registers estimated 101 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x9Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 VBROADCASTSD 32(CX), Y4 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x9Xor_loop: // Load 9 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y5 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y6 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y7 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y8 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y9 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y7, Y15, Y7 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y8, Y15, Y8 VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 152(CX), Y15 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 464(CX), Y15 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 648(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 656(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 664(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 672(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 680(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 688(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 696(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 704(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 712(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 9 outputs MOVQ (R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x9Xor_loop VZEROUPPER mulAvxGFNI_10x9Xor_end: RET // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 
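// mulAvxTwo_10x9Xor is the AVX2 fallback for CPUs without GFNI. Each
// GF(2^8) multiply uses the classic nibble split: the 0x0f mask in Y9
// separates each byte into low and high nibbles, two VPSHUFB lookups read
// a 16-entry table for each nibble, and XOR3WAY folds both halves into the
// accumulator. The two 32-byte tables for (input i, output j) start at
// byte offset 64*(i*9+j) in the matrix slice. Scalar equivalent per byte
// (sketch; low/high are the 16-entry half-tables for one coefficient):
//
//	out[n] ^= low[in[n]&0x0f] ^ high[in[n]>>4]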
// Loading no tables to registers // Destination kept on stack // Full registers estimated 194 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 mulAvxTwo_10x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND 
Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from 
input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 
VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4608(CX), Y10 VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 5184(CX), Y10 VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y8) // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x9Xor_loop VZEROUPPER mulAvxTwo_10x9Xor_end: RET // func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 215 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ 
out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 mulAvxTwo_10x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 
1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 
3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 
XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5120(CX), Y11 VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5760(CX), Y11 VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU Y9, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x10_loop VZEROUPPER mulAvxTwo_10x10_end: RET // func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64(SB), $8-88 // Loading 20 of 100 tables to 
registers // Destination kept on stack // Full registers estimated 112 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 
288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 10 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 10 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 VXORPD Z22, Z31, Z22 
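	// Note: only the first 20 of the 100 affine tables stay resident in Z0-Z19;
	// the remaining matrix entries are applied straight from memory via
	// VGF2P8AFFINEQB.BCST, which embeds a broadcast of one 8-byte affine
	// matrix into the multiply itself.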
VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R14), BP VMOVDQU64 Z20, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z21, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z22, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x10_64_loop VZEROUPPER mulGFNI_10x10_64_end: RET // func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x10(SB), $8-88 // Loading 4 of 100 tables to registers // Destination kept on stack // Full registers estimated 112 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x10_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x10_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 VBROADCASTSD 32(CX), Y8 VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 VBROADCASTSD 40(CX), Y9 VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 VBROADCASTSD 48(CX), Y10 VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 VBROADCASTSD 56(CX), Y11 VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 VBROADCASTSD 64(CX), Y12 VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 VBROADCASTSD 72(CX), Y13 VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 
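	// Note: the VEX-encoded 256-bit GFNI form has no embedded-broadcast
	// operand, so each table beyond the four cached in Y0-Y3 is loaded with an
	// explicit VBROADCASTSD before the affine multiply.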
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 
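	// Note: VXORPD here is plain bitwise XOR of the 32-byte lanes, i.e.
	// addition in GF(2^8); the floating-point mnemonic has no numeric effect.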
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 648(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 656(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 664(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 672(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 680(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 688(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 696(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 704(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 712(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 720(CX), Y15 
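	// Note: the tenth input pointer lives in DX, reusing the register that
	// held in_base; the other scratch GPRs already hold the first nine input
	// slices and the loop state.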
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 728(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 736(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 744(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 752(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 760(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 768(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 776(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 784(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 792(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x10_loop VZEROUPPER mulAvxGFNI_10x10_end: RET // func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 // Loading 20 of 100 tables to registers // Destination kept on stack // Full registers estimated 112 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulGFNI_10x10_64Xor_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 VBROADCASTF32X2 24(CX), Z3 VBROADCASTF32X2 32(CX), Z4 VBROADCASTF32X2 40(CX), Z5 VBROADCASTF32X2 48(CX), Z6 VBROADCASTF32X2 56(CX), Z7 VBROADCASTF32X2 64(CX), Z8 VBROADCASTF32X2 72(CX), Z9 VBROADCASTF32X2 80(CX), Z10 VBROADCASTF32X2 88(CX), Z11 VBROADCASTF32X2 96(CX), Z12 VBROADCASTF32X2 104(CX), Z13 VBROADCASTF32X2 112(CX), Z14 VBROADCASTF32X2 120(CX), Z15 VBROADCASTF32X2 128(CX), Z16 VBROADCASTF32X2 136(CX), Z17 VBROADCASTF32X2 144(CX), Z18 VBROADCASTF32X2 152(CX), Z19 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulGFNI_10x10_64Xor_loop: // Load 10 outputs MOVQ (R14), BP VMOVDQU64 (BP)(R15*1), Z20 MOVQ 24(R14), BP VMOVDQU64 (BP)(R15*1), Z21 MOVQ 48(R14), BP VMOVDQU64 (BP)(R15*1), Z22 MOVQ 72(R14), BP VMOVDQU64 (BP)(R15*1), Z23 MOVQ 96(R14), BP VMOVDQU64 (BP)(R15*1), Z24 MOVQ 120(R14), BP VMOVDQU64 (BP)(R15*1), Z25 MOVQ 144(R14), BP VMOVDQU64 (BP)(R15*1), Z26 MOVQ 168(R14), BP VMOVDQU64 (BP)(R15*1), Z27 MOVQ 192(R14), BP VMOVDQU64 (BP)(R15*1), Z28 MOVQ 216(R14), BP VMOVDQU64 (BP)(R15*1), Z29 // Load and process 64 bytes from input 0 to 10 outputs VMOVDQU64 (BX), Z30 ADDQ $0x40, BX VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 VXORPD Z24, 
Z31, Z24 VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 10 outputs VMOVDQU64 (SI), Z30 ADDQ $0x40, SI VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 2 to 10 outputs VMOVDQU64 (DI), Z30 ADDQ $0x40, DI VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 3 to 10 outputs VMOVDQU64 (R8), Z30 ADDQ $0x40, R8 VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 4 to 10 outputs VMOVDQU64 (R9), Z30 ADDQ $0x40, R9 VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 5 to 10 outputs VMOVDQU64 (R10), Z30 ADDQ $0x40, R10 VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 VXORPD Z23, 
Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 6 to 10 outputs VMOVDQU64 (R11), Z30 ADDQ $0x40, R11 VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 7 to 10 outputs VMOVDQU64 (R12), Z30 ADDQ $0x40, R12 VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 8 to 10 outputs VMOVDQU64 (R13), Z30 ADDQ $0x40, R13 VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 9 to 10 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 VXORPD Z20, Z31, Z20 VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 VXORPD Z21, Z31, Z21 VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 VXORPD Z22, Z31, Z22 VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 VXORPD Z23, Z31, Z23 VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 VXORPD Z24, Z31, Z24 VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 VXORPD Z25, Z31, Z25 VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 VXORPD Z26, Z31, Z26 VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 VXORPD Z27, Z31, Z27 VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 VXORPD Z28, Z31, Z28 VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 VXORPD Z29, Z31, Z29 // Store 10 outputs MOVQ (R14), BP VMOVDQU64 Z20, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU64 Z21, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU64 Z22, 
(BP)(R15*1) MOVQ 72(R14), BP VMOVDQU64 Z23, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU64 Z24, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU64 Z25, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU64 Z26, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU64 Z27, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU64 Z28, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU64 Z29, (BP)(R15*1) // Prepare for next loop ADDQ $0x40, R15 DECQ AX JNZ mulGFNI_10x10_64Xor_loop VZEROUPPER mulGFNI_10x10_64Xor_end: RET // func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, GFNI TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 // Loading 4 of 100 tables to registers // Destination kept on stack // Full registers estimated 112 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxGFNI_10x10Xor_end VBROADCASTSD (CX), Y0 VBROADCASTSD 8(CX), Y1 VBROADCASTSD 16(CX), Y2 VBROADCASTSD 24(CX), Y3 MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX mulAvxGFNI_10x10Xor_loop: // Load 10 outputs MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y4 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y5 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y6 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y7 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y8 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y9 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y10 MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y11 MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y12 MOVQ 216(R14), BP VMOVDQU (BP)(R15*1), Y13 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 VXORPD Y4, Y15, Y4 VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 VXORPD Y5, Y15, Y5 VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 VXORPD Y6, Y15, Y6 VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 32(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 40(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 48(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 56(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 64(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 72(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI VBROADCASTSD 80(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 88(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 96(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 104(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 112(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 120(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 128(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 136(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 144(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 152(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), 
Y14 ADDQ $0x20, DI VBROADCASTSD 160(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 168(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 176(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 184(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 192(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 200(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 208(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 216(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 224(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 232(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 VBROADCASTSD 240(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 248(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 256(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 264(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 272(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 280(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 288(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 296(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 304(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 312(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y14 ADDQ $0x20, R9 VBROADCASTSD 320(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 328(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 336(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 344(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 352(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 360(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 368(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 376(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 384(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 392(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y14 ADDQ $0x20, R10 VBROADCASTSD 400(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 408(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 416(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 424(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 432(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 440(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 448(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 456(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 464(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 472(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, 
Y15, Y13 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y14 ADDQ $0x20, R11 VBROADCASTSD 480(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 488(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 496(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 504(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 512(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 520(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 528(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 536(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 544(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 552(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y14 ADDQ $0x20, R12 VBROADCASTSD 560(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 568(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 576(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 584(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 592(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 600(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 608(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 616(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 624(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 632(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y14 ADDQ $0x20, R13 VBROADCASTSD 640(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 648(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 656(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 664(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 672(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 680(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 688(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 696(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 704(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, Y15, Y12 VBROADCASTSD 712(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX VBROADCASTSD 720(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y4, Y15, Y4 VBROADCASTSD 728(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y5, Y15, Y5 VBROADCASTSD 736(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y6, Y15, Y6 VBROADCASTSD 744(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y7, Y15, Y7 VBROADCASTSD 752(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y8, Y15, Y8 VBROADCASTSD 760(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y9, Y15, Y9 VBROADCASTSD 768(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y10, Y15, Y10 VBROADCASTSD 776(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y11, Y15, Y11 VBROADCASTSD 784(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y12, 
Y15, Y12 VBROADCASTSD 792(CX), Y15 VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 VXORPD Y13, Y15, Y13 // Store 10 outputs MOVQ (R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y9, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y10, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y11, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y12, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU Y13, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxGFNI_10x10Xor_loop VZEROUPPER mulAvxGFNI_10x10Xor_end: RET // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 215 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 mulAvxTwo_10x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R14), BP VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R14), BP VMOVDQU (BP)(R15*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) 
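	// Note: each GF(2^8) multiply above is the two-nibble VPSHUFB lookup:
	// source bytes are split into low and high nibbles (VPAND with the 0x0f
	// mask in Y10, VPSRLQ by 4), each nibble indexes a 32-byte table, and
	// XOR3WAY folds both partial products into the output accumulator.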
VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 
VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5120(CX), Y11 VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5760(CX), Y11 VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 
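	// Note: table pairs are laid out at (input*10+output)*64 bytes from CX;
	// e.g. the pair at 5824/5856 just above is input 9, output 1 (91*64 = 5824).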
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y9) // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU Y9, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x10Xor_loop VZEROUPPER mulAvxTwo_10x10Xor_end: RET // func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT2_avx2(SB), NOSPLIT, $0-56 MOVQ table+48(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 64(AX), Y1 VBROADCASTI128 16(AX), Y2 VBROADCASTI128 80(AX), Y3 VBROADCASTI128 32(AX), Y4 VBROADCASTI128 96(AX), Y5 VBROADCASTI128 48(AX), Y6 VBROADCASTI128 112(AX), Y7 MOVQ x_len+8(FP), AX MOVQ x_base+0(FP), CX MOVQ y_base+24(FP), DX MOVQ $0x0000000f, BX MOVQ BX, X8 VPBROADCASTB X8, Y8 loop: VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y12 VPXOR Y11, Y9, Y11 VPXOR Y12, Y10, Y12 VMOVDQU Y11, (DX) VMOVDQU Y12, 32(DX) VPSRLQ $0x04, Y11, Y13 VPAND Y8, Y11, Y11 VPAND Y8, Y13, Y13 VPSHUFB Y11, Y0, Y14 VPSHUFB Y11, Y1, Y11 VPSHUFB Y13, Y2, Y15 VPSHUFB Y13, Y3, Y13 VPXOR Y14, Y15, Y14 VPXOR Y11, Y13, Y11 VPAND Y12, Y8, Y13 VPSRLQ $0x04, Y12, Y12 VPAND Y8, Y12, Y12 VPSHUFB Y13, Y4, Y15 VPSHUFB Y13, Y5, Y13 VPXOR Y14, Y15, Y14 VPXOR Y11, Y13, Y11 VPSHUFB Y12, Y6, Y15 VPSHUFB Y12, Y7, Y13 XOR3WAY( $0x00, Y14, Y15, Y9) XOR3WAY( $0x00, Y11, Y13, Y10) VMOVDQU Y9, (CX) VMOVDQU Y10, 32(CX) ADDQ $0x40, CX ADDQ $0x40, DX SUBQ $0x40, AX JNZ loop VZEROUPPER RET // func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT2_avx2(SB), NOSPLIT, $0-56 MOVQ table+48(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 64(AX), Y1 VBROADCASTI128 16(AX), Y2 VBROADCASTI128 80(AX), Y3 VBROADCASTI128 32(AX), Y4 VBROADCASTI128 96(AX), Y5 VBROADCASTI128 48(AX), Y6 VBROADCASTI128 112(AX), Y7 MOVQ x_len+8(FP), AX MOVQ x_base+0(FP), CX MOVQ y_base+24(FP), DX MOVQ $0x0000000f, BX MOVQ BX, X8 VPBROADCASTB X8, Y8 loop: VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y12 VPSRLQ $0x04, Y11, Y13 VPAND Y8, Y11, Y11 VPAND Y8, Y13, Y13 VPSHUFB Y11, Y0, Y14 VPSHUFB Y11, Y1, Y11 VPSHUFB Y13, Y2, Y15 VPSHUFB Y13, Y3, Y13 VPXOR Y14, Y15, Y14 VPXOR Y11, Y13, Y11 VPAND Y12, Y8, Y13 VPSRLQ $0x04, 
Y12, Y12 VPAND Y8, Y12, Y12 VPSHUFB Y13, Y4, Y15 VPSHUFB Y13, Y5, Y13 VPXOR Y14, Y15, Y14 VPXOR Y11, Y13, Y11 VPSHUFB Y12, Y6, Y15 VPSHUFB Y12, Y7, Y13 XOR3WAY( $0x00, Y14, Y15, Y9) XOR3WAY( $0x00, Y11, Y13, Y10) VMOVDQU Y9, (CX) VMOVDQU Y10, 32(CX) VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y12 VPXOR Y11, Y9, Y11 VPXOR Y12, Y10, Y12 VMOVDQU Y11, (DX) VMOVDQU Y12, 32(DX) ADDQ $0x40, CX ADDQ $0x40, DX SUBQ $0x40, AX JNZ loop VZEROUPPER RET // func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) // Requires: AVX, AVX2, SSE2 TEXT ·mulgf16_avx2(SB), NOSPLIT, $0-56 MOVQ table+48(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 64(AX), Y1 VBROADCASTI128 16(AX), Y2 VBROADCASTI128 80(AX), Y3 VBROADCASTI128 32(AX), Y4 VBROADCASTI128 96(AX), Y5 VBROADCASTI128 48(AX), Y6 VBROADCASTI128 112(AX), Y7 MOVQ x_len+8(FP), AX MOVQ x_base+0(FP), CX MOVQ y_base+24(FP), DX MOVQ $0x0000000f, BX MOVQ BX, X8 VPBROADCASTB X8, Y8 loop: VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y10 VPSRLQ $0x04, Y9, Y11 VPAND Y8, Y9, Y9 VPAND Y8, Y11, Y11 VPSHUFB Y9, Y0, Y12 VPSHUFB Y9, Y1, Y9 VPSHUFB Y11, Y2, Y13 VPSHUFB Y11, Y3, Y11 VPXOR Y12, Y13, Y12 VPXOR Y9, Y11, Y9 VPAND Y10, Y8, Y11 VPSRLQ $0x04, Y10, Y10 VPAND Y8, Y10, Y10 VPSHUFB Y11, Y4, Y13 VPSHUFB Y11, Y5, Y11 VPXOR Y12, Y13, Y12 VPXOR Y9, Y11, Y9 VPSHUFB Y10, Y6, Y13 VPSHUFB Y10, Y7, Y11 VPXOR Y12, Y13, Y12 VPXOR Y9, Y11, Y9 VMOVDQU Y12, (CX) VMOVDQU Y9, 32(CX) ADDQ $0x40, CX ADDQ $0x40, DX SUBQ $0x40, AX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_0(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX VBROADCASTI128 (DX), Y1 VBROADCASTI128 64(DX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(DX), Y1 VBROADCASTI128 80(DX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(DX), Y1 VBROADCASTI128 96(DX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(DX), Y1 VBROADCASTI128 112(DX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), DX MOVQ 8(DX), BX XORQ SI, SI MOVQ (DX)(SI*1), DI ADDQ AX, SI MOVQ (DX)(SI*1), R8 ADDQ AX, SI MOVQ (DX)(SI*1), R9 ADDQ AX, SI MOVQ (DX)(SI*1), AX loop: VMOVDQU (DI), Y1 VMOVDQU 32(DI), Y2 VMOVDQU (R8), Y3 VMOVDQU 32(R8), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VPSHUFB Y5, Y24, Y7 VPSHUFB Y5, Y25, Y5 VPSHUFB Y6, Y26, Y8 VPSHUFB Y6, Y27, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, Y8, Y8 VPSHUFB Y6, Y28, Y9 VPSHUFB Y6, Y29, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VPSHUFB Y8, Y30, Y9 VPSHUFB Y8, Y31, Y6 VPTERNLOGD $0x96, Y7, Y9, Y1 VPTERNLOGD $0x96, Y5, Y6, Y2 VMOVDQU (R9), Y5 VMOVDQU 32(R9), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB 
Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 VPTERNLOGD $0x96, Y11, Y13, Y5 VPTERNLOGD $0x96, Y9, Y10, Y6 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VMOVDQU Y1, (DI) VMOVDQU Y2, 32(DI) ADDQ $0x40, DI VMOVDQU Y3, (R8) VMOVDQU Y4, 32(R8) ADDQ $0x40, R8 VMOVDQU Y5, (R9) VMOVDQU Y6, 32(R9) ADDQ $0x40, R9 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, BX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_0(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX VBROADCASTI128 (DX), Y1 VBROADCASTI128 64(DX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(DX), Y1 VBROADCASTI128 80(DX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(DX), Y1 VBROADCASTI128 96(DX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(DX), Y1 VBROADCASTI128 112(DX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), DX MOVQ 8(DX), BX XORQ SI, SI MOVQ (DX)(SI*1), DI ADDQ AX, SI MOVQ (DX)(SI*1), R8 ADDQ AX, SI MOVQ (DX)(SI*1), R9 ADDQ AX, SI MOVQ (DX)(SI*1), AX loop: VMOVDQU (DI), Y1 VMOVDQU 32(DI), Y2 VMOVDQU (R9), Y5 VMOVDQU 32(R9), Y6 VMOVDQU (R8), Y3 VMOVDQU 32(R8), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, 
Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y24, Y11 VPSHUFB Y9, Y25, Y9 VPSHUFB Y10, Y26, Y12 VPSHUFB Y10, Y27, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y4, Y0, Y10 VPSRLQ $0x04, Y4, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y28, Y13 VPSHUFB Y10, Y29, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y30, Y13 VPSHUFB Y12, Y31, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (DI) VMOVDQU Y2, 32(DI) ADDQ $0x40, DI VMOVDQU Y3, (R8) VMOVDQU Y4, 32(R8) ADDQ $0x40, R8 VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VBROADCASTI128 (CX), Y3 VBROADCASTI128 64(CX), Y4 VPSHUFB Y1, Y3, Y3 VPSHUFB Y1, Y4, Y1 VBROADCASTI128 16(CX), Y4 VBROADCASTI128 80(CX), Y9 VPSHUFB Y2, Y4, Y4 VPSHUFB Y2, Y9, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VBROADCASTI128 32(CX), Y9 VBROADCASTI128 96(CX), Y10 VPSHUFB Y2, Y9, Y9 VPSHUFB Y2, Y10, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VBROADCASTI128 48(CX), Y9 VBROADCASTI128 112(CX), Y2 VPSHUFB Y4, Y9, Y9 VPSHUFB Y4, Y2, Y2 VPTERNLOGD $0x96, Y3, Y9, Y5 VPTERNLOGD $0x96, Y1, Y2, Y6 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R9) VMOVDQU Y6, 32(R9) ADDQ $0x40, R9 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, BX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_1(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), CX VBROADCASTI128 (CX), Y1 VBROADCASTI128 64(CX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(CX), Y1 VBROADCASTI128 80(CX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(CX), Y1 VBROADCASTI128 96(CX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(CX), Y1 VBROADCASTI128 112(CX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y24, Y11 VPSHUFB Y9, Y25, Y9 VPSHUFB Y10, Y26, Y12 VPSHUFB Y10, Y27, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y28, Y13 VPSHUFB Y10, Y29, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y30, Y13 VPSHUFB Y12, Y31, Y10 VPTERNLOGD $0x96, Y11, Y13, Y5 VPTERNLOGD $0x96, Y9, 
Y10, Y6 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_1(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (CX), Y1 VBROADCASTI128 64(CX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(CX), Y1 VBROADCASTI128 80(CX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(CX), Y1 VBROADCASTI128 96(CX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(CX), Y1 VBROADCASTI128 112(CX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y4, Y0, Y10 VPSRLQ $0x04, Y4, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VPSHUFB Y1, Y24, Y3 VPSHUFB Y1, Y25, Y1 VPSHUFB Y2, Y26, Y4 VPSHUFB Y2, Y27, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VPSHUFB Y2, Y28, Y9 VPSHUFB Y2, Y29, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VPSHUFB Y4, Y30, Y9 VPSHUFB Y4, Y31, Y2 VPTERNLOGD $0x96, 
Y3, Y9, Y5 VPTERNLOGD $0x96, Y1, Y2, Y6 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_2(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), CX VBROADCASTI128 (CX), Y1 VBROADCASTI128 64(CX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(CX), Y1 VBROADCASTI128 80(CX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(CX), Y1 VBROADCASTI128 96(CX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(CX), Y1 VBROADCASTI128 112(CX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VPSHUFB Y5, Y24, Y7 VPSHUFB Y5, Y25, Y5 VPSHUFB Y6, Y26, Y8 VPSHUFB Y6, Y27, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, Y8, Y8 VPSHUFB Y6, Y28, Y9 VPSHUFB Y6, Y29, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VPSHUFB Y8, Y30, Y9 VPSHUFB Y8, Y31, Y6 VPTERNLOGD $0x96, Y7, Y9, Y1 VPTERNLOGD $0x96, Y5, Y6, Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_2(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ 
table02+48(FP), CX VBROADCASTI128 (CX), Y1 VBROADCASTI128 64(CX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(CX), Y1 VBROADCASTI128 80(CX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(CX), Y1 VBROADCASTI128 96(CX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(CX), Y1 VBROADCASTI128 112(CX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VPSHUFB Y1, Y24, Y3 VPSHUFB Y1, Y25, Y1 VPSHUFB Y2, Y26, Y4 VPSHUFB Y2, Y27, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VPSHUFB Y2, Y28, Y9 VPSHUFB Y2, Y29, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VPSHUFB Y4, Y30, Y9 VPSHUFB Y4, Y31, Y2 VPTERNLOGD $0x96, Y3, Y9, Y5 VPTERNLOGD $0x96, Y1, Y2, Y6 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_3(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), AX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), 
DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_3(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), CX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VPSHUFB Y1, Y16, Y3 VPSHUFB Y1, Y17, Y1 VPSHUFB Y2, Y18, Y4 VPSHUFB Y2, Y19, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VPSHUFB Y2, Y20, Y9 VPSHUFB Y2, Y21, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VPSHUFB Y4, Y22, Y9 VPSHUFB Y4, Y23, Y2 VPTERNLOGD $0x96, Y3, Y9, Y5 VPTERNLOGD $0x96, Y1, Y2, Y6 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_4(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ 
table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (CX), Y1 VBROADCASTI128 64(CX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(CX), Y1 VBROADCASTI128 80(CX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(CX), Y1 VBROADCASTI128 96(CX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(CX), Y1 VBROADCASTI128 112(CX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VPSHUFB Y5, Y16, Y7 VPSHUFB Y5, Y17, Y5 VPSHUFB Y6, Y18, Y8 VPSHUFB Y6, Y19, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, Y8, Y8 VPSHUFB Y6, Y20, Y9 VPSHUFB Y6, Y21, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VPSHUFB Y8, Y22, Y9 VPSHUFB Y8, Y23, Y6 VPTERNLOGD $0x96, Y7, Y9, Y1 VPTERNLOGD $0x96, Y5, Y6, Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y24, Y11 VPSHUFB Y9, Y25, Y9 VPSHUFB Y10, Y26, Y12 VPSHUFB Y10, Y27, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y28, Y13 VPSHUFB Y10, Y29, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y30, Y13 VPSHUFB Y12, Y31, Y10 VPTERNLOGD $0x96, Y11, Y13, Y5 VPTERNLOGD $0x96, Y9, Y10, Y6 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_4(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), CX VBROADCASTI128 (CX), Y1 VBROADCASTI128 64(CX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(CX), Y1 VBROADCASTI128 80(CX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(CX), Y1 VBROADCASTI128 96(CX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(CX), Y1 VBROADCASTI128 112(CX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z24 VMOVAPS Z0, Z25 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z26 VMOVAPS Z0, Z27 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z28 VMOVAPS Z0, Z29 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z30 VMOVAPS Z0, Z31 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX 
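// Note: the four work pointers are gathered by stepping BX through the
// slice headers in increments of dist (already scaled by the 24-byte
// slice-header size, per the comment above); each MOVQ reads only the
// data pointer, the first word of a Go slice header.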
MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y24, Y11 VPSHUFB Y9, Y25, Y9 VPSHUFB Y10, Y26, Y12 VPSHUFB Y10, Y27, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y4, Y0, Y10 VPSRLQ $0x04, Y4, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y28, Y13 VPSHUFB Y10, Y29, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y30, Y13 VPSHUFB Y12, Y31, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_5(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), CX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y5 VPTERNLOGD $0x96, Y9, Y10, Y6 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, 
DI VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_5(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), CX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y4, Y0, Y10 VPSRLQ $0x04, Y4, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx512_6(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), CX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VPSHUFB Y5, Y16, Y7 VPSHUFB Y5, Y17, Y5 VPSHUFB Y6, Y18, Y8 VPSHUFB Y6, Y19, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, Y8, Y8 VPSHUFB Y6, Y20, Y9 VPSHUFB Y6, Y21, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VPSHUFB Y8, Y22, Y9 VPSHUFB Y8, Y23, Y6 VPTERNLOGD $0x96, Y7, Y9, Y1 VPTERNLOGD $0x96, Y5, Y6, Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPXOR Y1, Y5, Y5 
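// Remaining cross-group XORs of the second ifft layer: this variant
// performs no table lookups here, so the butterfly finishes with plain
// XORs before the four stores below.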
VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx512_6(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), AX VBROADCASTI128 (AX), Y1 VBROADCASTI128 64(AX), Y0 VMOVAPS Z1, Z16 VMOVAPS Z0, Z17 VBROADCASTI128 16(AX), Y1 VBROADCASTI128 80(AX), Y0 VMOVAPS Z1, Z18 VMOVAPS Z0, Z19 VBROADCASTI128 32(AX), Y1 VBROADCASTI128 96(AX), Y0 VMOVAPS Z1, Z20 VMOVAPS Z0, Z21 VBROADCASTI128 48(AX), Y1 VBROADCASTI128 112(AX), Y0 VMOVAPS Z1, Z22 VMOVAPS Z0, Z23 MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y1 VMOVDQU 32(SI), Y2 VMOVDQU (R8), Y5 VMOVDQU 32(R8), Y6 VMOVDQU (DI), Y3 VMOVDQU 32(DI), Y4 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y1 VPTERNLOGD $0x96, Y9, Y10, Y2 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VPSHUFB Y9, Y16, Y11 VPSHUFB Y9, Y17, Y9 VPSHUFB Y10, Y18, Y12 VPSHUFB Y10, Y19, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VPSHUFB Y10, Y20, Y13 VPSHUFB Y10, Y21, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VPSHUFB Y12, Y22, Y13 VPSHUFB Y12, Y23, Y10 VPTERNLOGD $0x96, Y11, Y13, Y3 VPTERNLOGD $0x96, Y9, Y10, Y4 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (SI) VMOVDQU Y2, 32(SI) ADDQ $0x40, SI VMOVDQU Y3, (DI) VMOVDQU Y4, 32(DI) ADDQ $0x40, DI VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R8) VMOVDQU Y6, 32(R8) ADDQ $0x40, R8 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, SSE2 TEXT ·ifftDIT4_avx512_7(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), AX MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y0 VMOVDQU 32(SI), Y1 VMOVDQU (DI), Y2 VMOVDQU 32(DI), Y3 VPXOR Y0, Y2, Y2 VPXOR Y1, Y3, Y3 VMOVDQU (R8), Y4 VMOVDQU 32(R8), Y5 VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y7 VPXOR Y4, Y6, Y6 VPXOR Y5, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VMOVDQU Y0, (SI) VMOVDQU Y1, 32(SI) ADDQ $0x40, SI VMOVDQU Y2, (DI) VMOVDQU Y3, 32(DI) ADDQ $0x40, DI VMOVDQU Y4, (R8) 
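// (continuing the xor-only _7 butterfly: with all three multiplies
// skipped, the loop body is nothing but loads, XORs and these stores)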
VMOVDQU Y5, 32(R8) ADDQ $0x40, R8 VMOVDQU Y6, (AX) VMOVDQU Y7, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, SSE2 TEXT ·fftDIT4_avx512_7(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), AX MOVQ $0x0000000f, AX MOVQ AX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX XORQ BX, BX MOVQ (CX)(BX*1), SI ADDQ AX, BX MOVQ (CX)(BX*1), DI ADDQ AX, BX MOVQ (CX)(BX*1), R8 ADDQ AX, BX MOVQ (CX)(BX*1), AX loop: VMOVDQU (SI), Y0 VMOVDQU 32(SI), Y1 VMOVDQU (R8), Y4 VMOVDQU 32(R8), Y5 VMOVDQU (DI), Y2 VMOVDQU 32(DI), Y3 VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y7 VPXOR Y0, Y4, Y4 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y0, Y2, Y2 VPXOR Y1, Y3, Y3 VMOVDQU Y0, (SI) VMOVDQU Y1, 32(SI) ADDQ $0x40, SI VMOVDQU Y2, (DI) VMOVDQU Y3, 32(DI) ADDQ $0x40, DI VPXOR Y4, Y6, Y6 VPXOR Y5, Y7, Y7 VMOVDQU Y4, (R8) VMOVDQU Y5, 32(R8) ADDQ $0x40, R8 VMOVDQU Y6, (AX) VMOVDQU Y7, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JNZ loop VZEROUPPER RET // func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx2_0(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX MOVQ $0x0000000f, BX MOVQ BX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), BX MOVQ work_base+0(FP), SI MOVQ 8(SI), DI XORQ R8, R8 MOVQ (SI)(R8*1), R9 ADDQ BX, R8 MOVQ (SI)(R8*1), R10 ADDQ BX, R8 MOVQ (SI)(R8*1), R11 ADDQ BX, R8 MOVQ (SI)(R8*1), BX loop: VMOVDQU (R9), Y1 VMOVDQU 32(R9), Y2 VMOVDQU (R10), Y3 VMOVDQU 32(R10), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VBROADCASTI128 (AX), Y7 VBROADCASTI128 64(AX), Y8 VPSHUFB Y5, Y7, Y7 VPSHUFB Y5, Y8, Y5 VBROADCASTI128 16(AX), Y8 VBROADCASTI128 80(AX), Y9 VPSHUFB Y6, Y8, Y8 VPSHUFB Y6, Y9, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, Y8, Y8 VBROADCASTI128 32(AX), Y9 VBROADCASTI128 96(AX), Y10 VPSHUFB Y6, Y9, Y9 VPSHUFB Y6, Y10, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VBROADCASTI128 48(AX), Y9 VBROADCASTI128 112(AX), Y6 VPSHUFB Y8, Y9, Y9 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y7, Y9, Y1) XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R11), Y5 VMOVDQU 32(R11), Y6 VMOVDQU (BX), Y7 VMOVDQU 32(BX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y5) XOR3WAY( $0x00, Y9, Y10, Y6) VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (DX), Y11 VBROADCASTI128 64(DX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(DX), Y12 VBROADCASTI128 80(DX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, 
Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(DX), Y13 VBROADCASTI128 96(DX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(DX), Y13 VBROADCASTI128 112(DX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (DX), Y11 VBROADCASTI128 64(DX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(DX), Y12 VBROADCASTI128 80(DX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(DX), Y13 VBROADCASTI128 96(DX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(DX), Y13 VBROADCASTI128 112(DX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU Y1, (R9) VMOVDQU Y2, 32(R9) ADDQ $0x40, R9 VMOVDQU Y3, (R10) VMOVDQU Y4, 32(R10) ADDQ $0x40, R10 VMOVDQU Y5, (R11) VMOVDQU Y6, 32(R11) ADDQ $0x40, R11 VMOVDQU Y7, (BX) VMOVDQU Y8, 32(BX) ADDQ $0x40, BX SUBQ $0x40, DI JNZ loop VZEROUPPER RET // func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx2_0(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX MOVQ $0x0000000f, BX MOVQ BX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), BX MOVQ work_base+0(FP), SI MOVQ 8(SI), DI XORQ R8, R8 MOVQ (SI)(R8*1), R9 ADDQ BX, R8 MOVQ (SI)(R8*1), R10 ADDQ BX, R8 MOVQ (SI)(R8*1), R11 ADDQ BX, R8 MOVQ (SI)(R8*1), BX loop: VMOVDQU (R9), Y1 VMOVDQU 32(R9), Y2 VMOVDQU (R11), Y5 VMOVDQU 32(R11), Y6 VMOVDQU (R10), Y3 VMOVDQU 32(R10), Y4 VMOVDQU (BX), Y7 VMOVDQU 32(BX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (DX), Y11 VBROADCASTI128 64(DX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(DX), Y12 VBROADCASTI128 80(DX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(DX), Y13 VBROADCASTI128 96(DX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(DX), Y13 VBROADCASTI128 112(DX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (DX), Y11 VBROADCASTI128 64(DX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(DX), Y12 VBROADCASTI128 80(DX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(DX), Y13 VBROADCASTI128 96(DX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(DX), Y13 VBROADCASTI128 112(DX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (AX), Y11 VBROADCASTI128 64(AX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(AX), Y12 VBROADCASTI128 
80(AX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y4, Y0, Y10 VPSRLQ $0x04, Y4, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(AX), Y13 VBROADCASTI128 96(AX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(AX), Y13 VBROADCASTI128 112(AX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (R9) VMOVDQU Y2, 32(R9) ADDQ $0x40, R9 VMOVDQU Y3, (R10) VMOVDQU Y4, 32(R10) ADDQ $0x40, R10 VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VBROADCASTI128 (CX), Y3 VBROADCASTI128 64(CX), Y4 VPSHUFB Y1, Y3, Y3 VPSHUFB Y1, Y4, Y1 VBROADCASTI128 16(CX), Y4 VBROADCASTI128 80(CX), Y9 VPSHUFB Y2, Y4, Y4 VPSHUFB Y2, Y9, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VBROADCASTI128 32(CX), Y9 VBROADCASTI128 96(CX), Y10 VPSHUFB Y2, Y9, Y9 VPSHUFB Y2, Y10, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VBROADCASTI128 48(CX), Y9 VBROADCASTI128 112(CX), Y2 VPSHUFB Y4, Y9, Y9 VPSHUFB Y4, Y2, Y2 XOR3WAY( $0x00, Y3, Y9, Y5) XOR3WAY( $0x00, Y1, Y2, Y6) VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R11) VMOVDQU Y6, 32(R11) ADDQ $0x40, R11 VMOVDQU Y7, (BX) VMOVDQU Y8, 32(BX) ADDQ $0x40, BX SUBQ $0x40, DI JNZ loop VZEROUPPER RET // func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx2_1(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), CX MOVQ $0x0000000f, DX MOVQ DX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), DX MOVQ work_base+0(FP), BX MOVQ 8(BX), SI XORQ DI, DI MOVQ (BX)(DI*1), R8 ADDQ DX, DI MOVQ (BX)(DI*1), R9 ADDQ DX, DI MOVQ (BX)(DI*1), R10 ADDQ DX, DI MOVQ (BX)(DI*1), DX loop: VMOVDQU (R8), Y1 VMOVDQU 32(R8), Y2 VMOVDQU (R9), Y3 VMOVDQU 32(R9), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU (R10), Y5 VMOVDQU 32(R10), Y6 VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (AX), Y11 VBROADCASTI128 64(AX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(AX), Y12 VBROADCASTI128 80(AX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(AX), Y13 VBROADCASTI128 96(AX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(AX), Y13 VBROADCASTI128 112(AX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y5) XOR3WAY( $0x00, Y9, Y10, Y6) VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, 
Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU Y1, (R8) VMOVDQU Y2, 32(R8) ADDQ $0x40, R8 VMOVDQU Y3, (R9) VMOVDQU Y4, 32(R9) ADDQ $0x40, R9 VMOVDQU Y5, (R10) VMOVDQU Y6, 32(R10) ADDQ $0x40, R10 VMOVDQU Y7, (DX) VMOVDQU Y8, 32(DX) ADDQ $0x40, DX SUBQ $0x40, SI JNZ loop VZEROUPPER RET // func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx2_1(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX MOVQ $0x0000000f, DX MOVQ DX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), DX MOVQ work_base+0(FP), BX MOVQ 8(BX), SI XORQ DI, DI MOVQ (BX)(DI*1), R8 ADDQ DX, DI MOVQ (BX)(DI*1), R9 ADDQ DX, DI MOVQ (BX)(DI*1), R10 ADDQ DX, DI MOVQ (BX)(DI*1), DX loop: VMOVDQU (R8), Y1 VMOVDQU 32(R8), Y2 VMOVDQU (R10), Y5 VMOVDQU 32(R10), Y6 VMOVDQU (R9), Y3 VMOVDQU 32(R9), Y4 VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (AX), Y11 VBROADCASTI128 64(AX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(AX), Y12 VBROADCASTI128 80(AX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y4, Y0, Y10 VPSRLQ $0x04, Y4, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(AX), Y13 VBROADCASTI128 96(AX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(AX), Y13 VBROADCASTI128 112(AX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (R8) VMOVDQU Y2, 32(R8) ADDQ $0x40, R8 VMOVDQU Y3, (R9) VMOVDQU Y4, 32(R9) ADDQ $0x40, R9 VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VBROADCASTI128 (CX), Y3 VBROADCASTI128 64(CX), Y4 VPSHUFB Y1, Y3, Y3 VPSHUFB Y1, Y4, Y1 VBROADCASTI128 16(CX), Y4 VBROADCASTI128 80(CX), Y9 VPSHUFB Y2, Y4, Y4 VPSHUFB Y2, Y9, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VBROADCASTI128 32(CX), Y9 VBROADCASTI128 96(CX), Y10 VPSHUFB Y2, Y9, Y9 VPSHUFB Y2, Y10, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VBROADCASTI128 48(CX), Y9 VBROADCASTI128 112(CX), Y2 VPSHUFB Y4, Y9, Y9 VPSHUFB Y4, Y2, Y2 XOR3WAY( $0x00, Y3, Y9, Y5) XOR3WAY( $0x00, Y1, Y2, Y6) VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R10) VMOVDQU Y6, 32(R10) ADDQ $0x40, R10 VMOVDQU Y7, (DX) VMOVDQU Y8, 32(DX) ADDQ $0x40, DX SUBQ $0x40, SI JNZ loop VZEROUPPER RET // func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx2_2(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), CX MOVQ $0x0000000f, DX MOVQ DX, X0 VPBROADCASTB X0, Y0 
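// Y0 now holds 0x0f in every byte: the nibble mask used throughout the
// loop to split each data byte into a low and a high 4-bit table index
// for the VPSHUFB lookups.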
MOVQ dist+24(FP), DX MOVQ work_base+0(FP), BX MOVQ 8(BX), SI XORQ DI, DI MOVQ (BX)(DI*1), R8 ADDQ DX, DI MOVQ (BX)(DI*1), R9 ADDQ DX, DI MOVQ (BX)(DI*1), R10 ADDQ DX, DI MOVQ (BX)(DI*1), DX loop: VMOVDQU (R8), Y1 VMOVDQU 32(R8), Y2 VMOVDQU (R9), Y3 VMOVDQU 32(R9), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VBROADCASTI128 (AX), Y7 VBROADCASTI128 64(AX), Y8 VPSHUFB Y5, Y7, Y7 VPSHUFB Y5, Y8, Y5 VBROADCASTI128 16(AX), Y8 VBROADCASTI128 80(AX), Y9 VPSHUFB Y6, Y8, Y8 VPSHUFB Y6, Y9, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, Y8, Y8 VBROADCASTI128 32(AX), Y9 VBROADCASTI128 96(AX), Y10 VPSHUFB Y6, Y9, Y9 VPSHUFB Y6, Y10, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VBROADCASTI128 48(AX), Y9 VBROADCASTI128 112(AX), Y6 VPSHUFB Y8, Y9, Y9 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y7, Y9, Y1) XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R10), Y5 VMOVDQU 32(R10), Y6 VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU Y1, (R8) VMOVDQU Y2, 32(R8) ADDQ $0x40, R8 VMOVDQU Y3, (R9) VMOVDQU Y4, 32(R9) ADDQ $0x40, R9 VMOVDQU Y5, (R10) VMOVDQU Y6, 32(R10) ADDQ $0x40, R10 VMOVDQU Y7, (DX) VMOVDQU Y8, 32(DX) ADDQ $0x40, DX SUBQ $0x40, SI JNZ loop VZEROUPPER RET // func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx2_2(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), CX MOVQ $0x0000000f, DX MOVQ DX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), DX MOVQ work_base+0(FP), BX MOVQ 8(BX), SI XORQ DI, DI MOVQ (BX)(DI*1), R8 ADDQ DX, DI MOVQ (BX)(DI*1), R9 ADDQ DX, DI MOVQ (BX)(DI*1), R10 ADDQ DX, DI MOVQ (BX)(DI*1), DX loop: VMOVDQU (R8), Y1 VMOVDQU 32(R8), Y2 VMOVDQU (R10), Y5 VMOVDQU 32(R10), Y6 VMOVDQU (R9), Y3 VMOVDQU 32(R9), Y4 VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 
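// Y10 carries the high nibbles; each VPSHUFB here is a 32-way parallel
// lookup into a 16-byte table broadcast to both 128-bit lanes, and the
// partial products are folded together with VPXOR/XOR3WAY below.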
VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (R8) VMOVDQU Y2, 32(R8) ADDQ $0x40, R8 VMOVDQU Y3, (R9) VMOVDQU Y4, 32(R9) ADDQ $0x40, R9 VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VBROADCASTI128 (AX), Y3 VBROADCASTI128 64(AX), Y4 VPSHUFB Y1, Y3, Y3 VPSHUFB Y1, Y4, Y1 VBROADCASTI128 16(AX), Y4 VBROADCASTI128 80(AX), Y9 VPSHUFB Y2, Y4, Y4 VPSHUFB Y2, Y9, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VBROADCASTI128 32(AX), Y9 VBROADCASTI128 96(AX), Y10 VPSHUFB Y2, Y9, Y9 VPSHUFB Y2, Y10, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VBROADCASTI128 48(AX), Y9 VBROADCASTI128 112(AX), Y2 VPSHUFB Y4, Y9, Y9 VPSHUFB Y4, Y2, Y2 XOR3WAY( $0x00, Y3, Y9, Y5) XOR3WAY( $0x00, Y1, Y2, Y6) VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R10) VMOVDQU Y6, 32(R10) ADDQ $0x40, R10 VMOVDQU Y7, (DX) VMOVDQU Y8, 32(DX) ADDQ $0x40, DX SUBQ $0x40, SI JNZ loop VZEROUPPER RET // func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx2_3(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), AX MOVQ $0x0000000f, CX MOVQ CX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), CX MOVQ work_base+0(FP), DX MOVQ 8(DX), BX XORQ SI, SI MOVQ (DX)(SI*1), DI ADDQ CX, SI MOVQ (DX)(SI*1), R8 ADDQ CX, SI MOVQ (DX)(SI*1), R9 ADDQ CX, SI MOVQ (DX)(SI*1), CX loop: VMOVDQU (DI), Y1 VMOVDQU 32(DI), Y2 VMOVDQU (R8), Y3 VMOVDQU 32(R8), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU (R9), Y5 VMOVDQU 32(R9), Y6 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (AX), Y11 VBROADCASTI128 64(AX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(AX), Y12 VBROADCASTI128 80(AX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(AX), Y13 VBROADCASTI128 96(AX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(AX), Y13 VBROADCASTI128 112(AX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 
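// Split Y7 into nibbles: the shifted copy (Y10) supplies the high-nibble
// indices and the masked copy (Y9) the low-nibble indices for the table
// lookups that follow.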
VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (AX), Y11 VBROADCASTI128 64(AX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(AX), Y12 VBROADCASTI128 80(AX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(AX), Y13 VBROADCASTI128 96(AX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(AX), Y13 VBROADCASTI128 112(AX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU Y1, (DI) VMOVDQU Y2, 32(DI) ADDQ $0x40, DI VMOVDQU Y3, (R8) VMOVDQU Y4, 32(R8) ADDQ $0x40, R8 VMOVDQU Y5, (R9) VMOVDQU Y6, 32(R9) ADDQ $0x40, R9 VMOVDQU Y7, (CX) VMOVDQU Y8, 32(CX) ADDQ $0x40, CX SUBQ $0x40, BX JNZ loop VZEROUPPER RET // func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx2_3(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), AX MOVQ table02+48(FP), CX MOVQ $0x0000000f, CX MOVQ CX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), CX MOVQ work_base+0(FP), DX MOVQ 8(DX), BX XORQ SI, SI MOVQ (DX)(SI*1), DI ADDQ CX, SI MOVQ (DX)(SI*1), R8 ADDQ CX, SI MOVQ (DX)(SI*1), R9 ADDQ CX, SI MOVQ (DX)(SI*1), CX loop: VMOVDQU (DI), Y1 VMOVDQU 32(DI), Y2 VMOVDQU (R9), Y5 VMOVDQU 32(R9), Y6 VMOVDQU (R8), Y3 VMOVDQU 32(R8), Y4 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VMOVDQU Y1, (DI) VMOVDQU Y2, 32(DI) ADDQ $0x40, DI VMOVDQU Y3, (R8) VMOVDQU Y4, 32(R8) ADDQ $0x40, R8 VPSRLQ $0x04, Y7, Y2 VPAND Y0, Y7, Y1 VPAND Y0, Y2, Y2 VBROADCASTI128 (AX), Y3 VBROADCASTI128 64(AX), Y4 VPSHUFB Y1, Y3, Y3 VPSHUFB Y1, Y4, Y1 VBROADCASTI128 16(AX), Y4 VBROADCASTI128 80(AX), Y9 VPSHUFB Y2, Y4, Y4 VPSHUFB Y2, Y9, Y2 VPXOR Y3, Y4, Y3 VPXOR Y1, Y2, Y1 VPAND Y8, Y0, Y2 VPSRLQ $0x04, Y8, Y4 VPAND Y0, Y4, Y4 VBROADCASTI128 32(AX), Y9 VBROADCASTI128 96(AX), Y10 VPSHUFB Y2, Y9, Y9 VPSHUFB Y2, Y10, Y2 VPXOR Y3, Y9, Y3 VPXOR Y1, Y2, Y1 VBROADCASTI128 48(AX), Y9 VBROADCASTI128 112(AX), Y2 VPSHUFB Y4, Y9, Y9 VPSHUFB Y4, Y2, Y2 XOR3WAY( $0x00, Y3, Y9, Y5) XOR3WAY( $0x00, Y1, Y2, Y6) VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VMOVDQU Y5, (R9) VMOVDQU Y6, 32(R9) ADDQ $0x40, R9 VMOVDQU Y7, (CX) VMOVDQU Y8, 32(CX) ADDQ $0x40, CX SUBQ $0x40, BX JNZ loop VZEROUPPER RET // func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT4_avx2_4(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), DX MOVQ $0x0000000f, DX MOVQ DX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), DX MOVQ work_base+0(FP), BX MOVQ 8(BX), SI XORQ DI, DI MOVQ (BX)(DI*1), R8 ADDQ DX, DI MOVQ (BX)(DI*1), R9 ADDQ DX, DI MOVQ (BX)(DI*1), R10 ADDQ DX, DI MOVQ (BX)(DI*1), DX loop: VMOVDQU (R8), Y1 VMOVDQU 32(R8), Y2 VMOVDQU (R9), Y3 VMOVDQU 32(R9), Y4 VPXOR Y1, Y3, Y3 VPXOR Y2, Y4, Y4 VPSRLQ $0x04, Y3, Y6 VPAND Y0, Y3, Y5 VPAND Y0, Y6, Y6 VBROADCASTI128 (AX), Y7 VBROADCASTI128 64(AX), Y8 VPSHUFB Y5, Y7, Y7 VPSHUFB Y5, Y8, Y5 VBROADCASTI128 16(AX), Y8 VBROADCASTI128 80(AX), Y9 VPSHUFB Y6, Y8, Y8 VPSHUFB Y6, Y9, Y6 VPXOR Y7, Y8, Y7 VPXOR Y5, Y6, Y5 VPAND Y4, Y0, Y6 VPSRLQ $0x04, Y4, Y8 VPAND Y0, 
Y8, Y8 VBROADCASTI128 32(AX), Y9 VBROADCASTI128 96(AX), Y10 VPSHUFB Y6, Y9, Y9 VPSHUFB Y6, Y10, Y6 VPXOR Y7, Y9, Y7 VPXOR Y5, Y6, Y5 VBROADCASTI128 48(AX), Y9 VBROADCASTI128 112(AX), Y6 VPSHUFB Y8, Y9, Y9 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y7, Y9, Y1) XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R10), Y5 VMOVDQU 32(R10), Y6 VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y8 VPXOR Y5, Y7, Y7 VPXOR Y6, Y8, Y8 VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y5) XOR3WAY( $0x00, Y9, Y10, Y6) VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VMOVDQU Y1, (R8) VMOVDQU Y2, 32(R8) ADDQ $0x40, R8 VMOVDQU Y3, (R9) VMOVDQU Y4, 32(R9) ADDQ $0x40, R9 VMOVDQU Y5, (R10) VMOVDQU Y6, 32(R10) ADDQ $0x40, R10 VMOVDQU Y7, (DX) VMOVDQU Y8, 32(DX) ADDQ $0x40, DX SUBQ $0x40, SI JNZ loop VZEROUPPER RET // func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·fftDIT4_avx2_4(SB), NOSPLIT, $0-56 // dist must be multiplied by 24 (size of slice header) MOVQ table01+32(FP), AX MOVQ table23+40(FP), CX MOVQ table02+48(FP), CX MOVQ $0x0000000f, DX MOVQ DX, X0 VPBROADCASTB X0, Y0 MOVQ dist+24(FP), DX MOVQ work_base+0(FP), BX MOVQ 8(BX), SI XORQ DI, DI MOVQ (BX)(DI*1), R8 ADDQ DX, DI MOVQ (BX)(DI*1), R9 ADDQ DX, DI MOVQ (BX)(DI*1), R10 ADDQ DX, DI MOVQ (BX)(DI*1), DX loop: VMOVDQU (R8), Y1 VMOVDQU 32(R8), Y2 VMOVDQU (R10), Y5 VMOVDQU 32(R10), Y6 VMOVDQU (R9), Y3 VMOVDQU 32(R9), Y4 VMOVDQU (DX), Y7 VMOVDQU 32(DX), Y8 VPSRLQ $0x04, Y5, Y10 VPAND Y0, Y5, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y6, Y0, Y10 VPSRLQ $0x04, Y6, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y1) XOR3WAY( $0x00, Y9, Y10, Y2) VPSRLQ $0x04, Y7, Y10 VPAND Y0, Y7, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (CX), Y11 VBROADCASTI128 64(CX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB Y9, Y12, Y9 VBROADCASTI128 16(CX), Y12 VBROADCASTI128 80(CX), Y13 VPSHUFB Y10, Y12, Y12 VPSHUFB Y10, Y13, Y10 VPXOR Y11, Y12, Y11 VPXOR Y9, Y10, Y9 VPAND Y8, Y0, Y10 VPSRLQ $0x04, Y8, Y12 VPAND Y0, Y12, Y12 VBROADCASTI128 32(CX), Y13 VBROADCASTI128 96(CX), Y14 VPSHUFB Y10, Y13, Y13 VPSHUFB Y10, Y14, Y10 VPXOR Y11, Y13, Y11 VPXOR Y9, Y10, Y9 VBROADCASTI128 48(CX), Y13 VBROADCASTI128 112(CX), Y10 VPSHUFB Y12, Y13, Y13 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y11, Y13, Y3) XOR3WAY( $0x00, Y9, Y10, Y4) VPXOR Y1, Y5, Y5 VPXOR Y2, Y6, Y6 VPXOR Y3, Y7, Y7 VPXOR Y4, Y8, Y8 VPSRLQ $0x04, Y3, Y10 VPAND Y0, Y3, Y9 VPAND Y0, Y10, Y10 VBROADCASTI128 (AX), Y11 VBROADCASTI128 64(AX), Y12 VPSHUFB Y9, Y11, Y11 VPSHUFB 
	VBROADCASTI128 16(AX), Y12
	VBROADCASTI128 80(AX), Y13
	VPSHUFB Y10, Y12, Y12
	VPSHUFB Y10, Y13, Y10
	VPXOR Y11, Y12, Y11
	VPXOR Y9, Y10, Y9
	VPAND Y4, Y0, Y10
	VPSRLQ $0x04, Y4, Y12
	VPAND Y0, Y12, Y12
	VBROADCASTI128 32(AX), Y13
	VBROADCASTI128 96(AX), Y14
	VPSHUFB Y10, Y13, Y13
	VPSHUFB Y10, Y14, Y10
	VPXOR Y11, Y13, Y11
	VPXOR Y9, Y10, Y9
	VBROADCASTI128 48(AX), Y13
	VBROADCASTI128 112(AX), Y10
	VPSHUFB Y12, Y13, Y13
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y11, Y13, Y1)
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VPXOR Y1, Y3, Y3
	VPXOR Y2, Y4, Y4
	VMOVDQU Y1, (R8)
	VMOVDQU Y2, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y3, (R9)
	VMOVDQU Y4, 32(R9)
	ADDQ $0x40, R9
	VPXOR Y5, Y7, Y7
	VPXOR Y6, Y8, Y8
	VMOVDQU Y5, (R10)
	VMOVDQU Y6, 32(R10)
	ADDQ $0x40, R10
	VMOVDQU Y7, (DX)
	VMOVDQU Y8, 32(DX)
	ADDQ $0x40, DX
	SUBQ $0x40, SI
	JNZ loop
	VZEROUPPER
	RET

// func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT4_avx2_5(SB), NOSPLIT, $0-56
	// dist must be multiplied by 24 (size of slice header)
	MOVQ table01+32(FP), AX
	MOVQ table23+40(FP), AX
	MOVQ table02+48(FP), CX
	MOVQ $0x0000000f, CX
	MOVQ CX, X0
	VPBROADCASTB X0, Y0
	MOVQ dist+24(FP), CX
	MOVQ work_base+0(FP), DX
	MOVQ 8(DX), BX
	XORQ SI, SI
	MOVQ (DX)(SI*1), DI
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R8
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R9
	ADDQ CX, SI
	MOVQ (DX)(SI*1), CX

loop:
	VMOVDQU (DI), Y1
	VMOVDQU 32(DI), Y2
	VMOVDQU (R8), Y3
	VMOVDQU 32(R8), Y4
	VPXOR Y1, Y3, Y3
	VPXOR Y2, Y4, Y4
	VMOVDQU (R9), Y5
	VMOVDQU 32(R9), Y6
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPXOR Y5, Y7, Y7
	VPXOR Y6, Y8, Y8
	VPSRLQ $0x04, Y7, Y10
	VPAND Y0, Y7, Y9
	VPAND Y0, Y10, Y10
	VBROADCASTI128 (AX), Y11
	VBROADCASTI128 64(AX), Y12
	VPSHUFB Y9, Y11, Y11
	VPSHUFB Y9, Y12, Y9
	VBROADCASTI128 16(AX), Y12
	VBROADCASTI128 80(AX), Y13
	VPSHUFB Y10, Y12, Y12
	VPSHUFB Y10, Y13, Y10
	VPXOR Y11, Y12, Y11
	VPXOR Y9, Y10, Y9
	VPAND Y8, Y0, Y10
	VPSRLQ $0x04, Y8, Y12
	VPAND Y0, Y12, Y12
	VBROADCASTI128 32(AX), Y13
	VBROADCASTI128 96(AX), Y14
	VPSHUFB Y10, Y13, Y13
	VPSHUFB Y10, Y14, Y10
	VPXOR Y11, Y13, Y11
	VPXOR Y9, Y10, Y9
	VBROADCASTI128 48(AX), Y13
	VBROADCASTI128 112(AX), Y10
	VPSHUFB Y12, Y13, Y13
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y11, Y13, Y5)
	XOR3WAY( $0x00, Y9, Y10, Y6)
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VMOVDQU Y1, (DI)
	VMOVDQU Y2, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y3, (R8)
	VMOVDQU Y4, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y5, (R9)
	VMOVDQU Y6, 32(R9)
	ADDQ $0x40, R9
	VMOVDQU Y7, (CX)
	VMOVDQU Y8, 32(CX)
	ADDQ $0x40, CX
	SUBQ $0x40, BX
	JNZ loop
	VZEROUPPER
	RET

// func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT4_avx2_5(SB), NOSPLIT, $0-56
	// dist must be multiplied by 24 (size of slice header)
	MOVQ table01+32(FP), AX
	MOVQ table23+40(FP), CX
	MOVQ table02+48(FP), CX
	MOVQ $0x0000000f, CX
	MOVQ CX, X0
	VPBROADCASTB X0, Y0
	MOVQ dist+24(FP), CX
	MOVQ work_base+0(FP), DX
	MOVQ 8(DX), BX
	XORQ SI, SI
	MOVQ (DX)(SI*1), DI
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R8
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R9
	ADDQ CX, SI
	MOVQ (DX)(SI*1), CX

loop:
	VMOVDQU (DI), Y1
	VMOVDQU 32(DI), Y2
	VMOVDQU (R9), Y5
	VMOVDQU 32(R9), Y6
	VMOVDQU (R8), Y3
	VMOVDQU 32(R8), Y4
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VPSRLQ $0x04, Y3, Y10
	VPAND Y0, Y3, Y9
	VPAND Y0, Y10, Y10
	VBROADCASTI128 (AX), Y11
	VBROADCASTI128 64(AX), Y12
	VPSHUFB Y9, Y11, Y11
	VPSHUFB Y9, Y12, Y9
	VBROADCASTI128 16(AX), Y12
	VBROADCASTI128 80(AX), Y13
	VPSHUFB Y10, Y12, Y12
	VPSHUFB Y10, Y13, Y10
	VPXOR Y11, Y12, Y11
	VPXOR Y9, Y10, Y9
	VPAND Y4, Y0, Y10
	VPSRLQ $0x04, Y4, Y12
	VPAND Y0, Y12, Y12
	VBROADCASTI128 32(AX), Y13
	VBROADCASTI128 96(AX), Y14
	VPSHUFB Y10, Y13, Y13
	VPSHUFB Y10, Y14, Y10
	VPXOR Y11, Y13, Y11
	VPXOR Y9, Y10, Y9
	VBROADCASTI128 48(AX), Y13
	VBROADCASTI128 112(AX), Y10
	VPSHUFB Y12, Y13, Y13
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y11, Y13, Y1)
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VPXOR Y1, Y3, Y3
	VPXOR Y2, Y4, Y4
	VMOVDQU Y1, (DI)
	VMOVDQU Y2, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y3, (R8)
	VMOVDQU Y4, 32(R8)
	ADDQ $0x40, R8
	VPXOR Y5, Y7, Y7
	VPXOR Y6, Y8, Y8
	VMOVDQU Y5, (R9)
	VMOVDQU Y6, 32(R9)
	ADDQ $0x40, R9
	VMOVDQU Y7, (CX)
	VMOVDQU Y8, 32(CX)
	ADDQ $0x40, CX
	SUBQ $0x40, BX
	JNZ loop
	VZEROUPPER
	RET

// func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT4_avx2_6(SB), NOSPLIT, $0-56
	// dist must be multiplied by 24 (size of slice header)
	MOVQ table01+32(FP), AX
	MOVQ table23+40(FP), CX
	MOVQ table02+48(FP), CX
	MOVQ $0x0000000f, CX
	MOVQ CX, X0
	VPBROADCASTB X0, Y0
	MOVQ dist+24(FP), CX
	MOVQ work_base+0(FP), DX
	MOVQ 8(DX), BX
	XORQ SI, SI
	MOVQ (DX)(SI*1), DI
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R8
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R9
	ADDQ CX, SI
	MOVQ (DX)(SI*1), CX

loop:
	VMOVDQU (DI), Y1
	VMOVDQU 32(DI), Y2
	VMOVDQU (R8), Y3
	VMOVDQU 32(R8), Y4
	VPXOR Y1, Y3, Y3
	VPXOR Y2, Y4, Y4
	VPSRLQ $0x04, Y3, Y6
	VPAND Y0, Y3, Y5
	VPAND Y0, Y6, Y6
	VBROADCASTI128 (AX), Y7
	VBROADCASTI128 64(AX), Y8
	VPSHUFB Y5, Y7, Y7
	VPSHUFB Y5, Y8, Y5
	VBROADCASTI128 16(AX), Y8
	VBROADCASTI128 80(AX), Y9
	VPSHUFB Y6, Y8, Y8
	VPSHUFB Y6, Y9, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y6, Y5
	VPAND Y4, Y0, Y6
	VPSRLQ $0x04, Y4, Y8
	VPAND Y0, Y8, Y8
	VBROADCASTI128 32(AX), Y9
	VBROADCASTI128 96(AX), Y10
	VPSHUFB Y6, Y9, Y9
	VPSHUFB Y6, Y10, Y6
	VPXOR Y7, Y9, Y7
	VPXOR Y5, Y6, Y5
	VBROADCASTI128 48(AX), Y9
	VBROADCASTI128 112(AX), Y6
	VPSHUFB Y8, Y9, Y9
	VPSHUFB Y8, Y6, Y6
	XOR3WAY( $0x00, Y7, Y9, Y1)
	XOR3WAY( $0x00, Y5, Y6, Y2)
	VMOVDQU (R9), Y5
	VMOVDQU 32(R9), Y6
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPXOR Y5, Y7, Y7
	VPXOR Y6, Y8, Y8
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VMOVDQU Y1, (DI)
	VMOVDQU Y2, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y3, (R8)
	VMOVDQU Y4, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y5, (R9)
	VMOVDQU Y6, 32(R9)
	ADDQ $0x40, R9
	VMOVDQU Y7, (CX)
	VMOVDQU Y8, 32(CX)
	ADDQ $0x40, CX
	SUBQ $0x40, BX
	JNZ loop
	VZEROUPPER
	RET

// func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT4_avx2_6(SB), NOSPLIT, $0-56
	// dist must be multiplied by 24 (size of slice header)
	MOVQ table01+32(FP), AX
	MOVQ table23+40(FP), AX
	MOVQ table02+48(FP), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X0
	VPBROADCASTB X0, Y0
	MOVQ dist+24(FP), CX
	MOVQ work_base+0(FP), DX
	MOVQ 8(DX), BX
	XORQ SI, SI
	MOVQ (DX)(SI*1), DI
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R8
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R9
	ADDQ CX, SI
	MOVQ (DX)(SI*1), CX

loop:
	VMOVDQU (DI), Y1
	VMOVDQU 32(DI), Y2
	VMOVDQU (R9), Y5
	VMOVDQU 32(R9), Y6
	VMOVDQU (R8), Y3
	VMOVDQU 32(R8), Y4
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPSRLQ $0x04, Y5, Y10
	VPAND Y0, Y5, Y9
	VPAND Y0, Y10, Y10
	VBROADCASTI128 (AX), Y11
	VBROADCASTI128 64(AX), Y12
	VPSHUFB Y9, Y11, Y11
	VPSHUFB Y9, Y12, Y9
	VBROADCASTI128 16(AX), Y12
	VBROADCASTI128 80(AX), Y13
	VPSHUFB Y10, Y12, Y12
	VPSHUFB Y10, Y13, Y10
	VPXOR Y11, Y12, Y11
	VPXOR Y9, Y10, Y9
	VPAND Y6, Y0, Y10
	VPSRLQ $0x04, Y6, Y12
	VPAND Y0, Y12, Y12
	VBROADCASTI128 32(AX), Y13
	VBROADCASTI128 96(AX), Y14
	VPSHUFB Y10, Y13, Y13
	VPSHUFB Y10, Y14, Y10
	VPXOR Y11, Y13, Y11
	VPXOR Y9, Y10, Y9
	VBROADCASTI128 48(AX), Y13
	VBROADCASTI128 112(AX), Y10
	VPSHUFB Y12, Y13, Y13
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y11, Y13, Y1)
	XOR3WAY( $0x00, Y9, Y10, Y2)
	VPSRLQ $0x04, Y7, Y10
	VPAND Y0, Y7, Y9
	VPAND Y0, Y10, Y10
	VBROADCASTI128 (AX), Y11
	VBROADCASTI128 64(AX), Y12
	VPSHUFB Y9, Y11, Y11
	VPSHUFB Y9, Y12, Y9
	VBROADCASTI128 16(AX), Y12
	VBROADCASTI128 80(AX), Y13
	VPSHUFB Y10, Y12, Y12
	VPSHUFB Y10, Y13, Y10
	VPXOR Y11, Y12, Y11
	VPXOR Y9, Y10, Y9
	VPAND Y8, Y0, Y10
	VPSRLQ $0x04, Y8, Y12
	VPAND Y0, Y12, Y12
	VBROADCASTI128 32(AX), Y13
	VBROADCASTI128 96(AX), Y14
	VPSHUFB Y10, Y13, Y13
	VPSHUFB Y10, Y14, Y10
	VPXOR Y11, Y13, Y11
	VPXOR Y9, Y10, Y9
	VBROADCASTI128 48(AX), Y13
	VBROADCASTI128 112(AX), Y10
	VPSHUFB Y12, Y13, Y13
	VPSHUFB Y12, Y10, Y10
	XOR3WAY( $0x00, Y11, Y13, Y3)
	XOR3WAY( $0x00, Y9, Y10, Y4)
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VPXOR Y1, Y3, Y3
	VPXOR Y2, Y4, Y4
	VMOVDQU Y1, (DI)
	VMOVDQU Y2, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y3, (R8)
	VMOVDQU Y4, 32(R8)
	ADDQ $0x40, R8
	VPXOR Y5, Y7, Y7
	VPXOR Y6, Y8, Y8
	VMOVDQU Y5, (R9)
	VMOVDQU Y6, 32(R9)
	ADDQ $0x40, R9
	VMOVDQU Y7, (CX)
	VMOVDQU Y8, 32(CX)
	ADDQ $0x40, CX
	SUBQ $0x40, BX
	JNZ loop
	VZEROUPPER
	RET

// func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·ifftDIT4_avx2_7(SB), NOSPLIT, $0-56
	// dist must be multiplied by 24 (size of slice header)
	MOVQ table01+32(FP), AX
	MOVQ table23+40(FP), AX
	MOVQ table02+48(FP), AX
	MOVQ $0x0000000f, AX
	MOVQ AX, X0
	VPBROADCASTB X0, Y0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU (SI), Y0
	VMOVDQU 32(SI), Y1
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y3
	VPXOR Y0, Y2, Y2
	VPXOR Y1, Y3, Y3
	VMOVDQU (R8), Y4
	VMOVDQU 32(R8), Y5
	VMOVDQU (AX), Y6
	VMOVDQU 32(AX), Y7
	VPXOR Y4, Y6, Y6
	VPXOR Y5, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VMOVDQU Y0, (SI)
	VMOVDQU Y1, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y2, (DI)
	VMOVDQU Y3, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y4, (R8)
	VMOVDQU Y5, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y6, (AX)
	VMOVDQU Y7, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JNZ loop
	VZEROUPPER
	RET

// func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·fftDIT4_avx2_7(SB), NOSPLIT, $0-56
	// dist must be multiplied by 24 (size of slice header)
	MOVQ table01+32(FP), AX
	MOVQ table23+40(FP), AX
	MOVQ table02+48(FP), AX
	MOVQ $0x0000000f, AX
	MOVQ AX, X0
	VPBROADCASTB X0, Y0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU (SI), Y0
	VMOVDQU 32(SI), Y1
	VMOVDQU (R8), Y4
	VMOVDQU 32(R8), Y5
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y3
	VMOVDQU (AX), Y6
	VMOVDQU 32(AX), Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VPXOR Y0, Y2, Y2
	VPXOR Y1, Y3, Y3
	VMOVDQU Y0, (SI)
	VMOVDQU Y1, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y2, (DI)
	VMOVDQU Y3, 32(DI)
	ADDQ $0x40, DI
	VPXOR Y4, Y6, Y6
	VPXOR Y5, Y7, Y7
	VMOVDQU Y4, (R8)
	VMOVDQU Y5, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y6, (AX)
	VMOVDQU Y7, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JNZ loop
	VZEROUPPER
	RET
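// In the *_avx2_7 variants above every log is zero, so all three
// multiplies are by 1 and the butterfly collapses to plain XORs: the
// loops never read a table byte, and the table and nibble-mask loads at
// function entry are dead values left behind by the code generator.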
// func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
// Requires: SSE, SSE2, SSSE3
TEXT ·ifftDIT2_ssse3(SB), NOSPLIT, $0-56
	MOVQ table+48(FP), AX
	MOVUPS (AX), X0
	MOVUPS 64(AX), X1
	MOVUPS 16(AX), X2
	MOVUPS 80(AX), X3
	MOVUPS 32(AX), X4
	MOVUPS 96(AX), X5
	XORPS X6, X6
	MOVQ $0x0000000f, CX
	MOVQ CX, X7
	PSHUFB X6, X7
	MOVQ x_len+8(FP), CX
	MOVQ x_base+0(FP), DX
	MOVQ y_base+24(FP), BX

loop:
	MOVUPS (DX), X6
	MOVUPS 32(DX), X8
	MOVUPS (BX), X9
	MOVUPS 32(BX), X10
	PXOR X6, X9
	PXOR X8, X10
	MOVUPS X9, (BX)
	MOVUPS X10, 32(BX)
	MOVAPS X9, X11
	PSRLQ $0x04, X11
	MOVAPS X9, X9
	PAND X7, X9
	PAND X7, X11
	MOVUPS X0, X12
	MOVUPS X1, X13
	PSHUFB X9, X12
	PSHUFB X9, X13
	MOVUPS X2, X9
	MOVUPS X3, X14
	PSHUFB X11, X9
	PSHUFB X11, X14
	PXOR X9, X12
	PXOR X14, X13
	MOVAPS X10, X9
	MOVAPS X10, X10
	PAND X7, X9
	PSRLQ $0x04, X10
	PAND X7, X10
	MOVUPS X4, X11
	MOVUPS X5, X14
	PSHUFB X9, X11
	PSHUFB X9, X14
	PXOR X11, X12
	PXOR X14, X13
	MOVUPS 48(AX), X11
	MOVUPS 112(AX), X14
	PSHUFB X10, X11
	PSHUFB X10, X14
	PXOR X11, X12
	PXOR X14, X13
	PXOR X12, X6
	PXOR X13, X8
	MOVUPS X6, (DX)
	MOVUPS X8, 32(DX)
	MOVUPS 16(DX), X6
	MOVUPS 48(DX), X8
	MOVUPS 16(BX), X9
	MOVUPS 48(BX), X10
	PXOR X6, X9
	PXOR X8, X10
	MOVUPS X9, 16(BX)
	MOVUPS X10, 48(BX)
	MOVAPS X9, X11
	PSRLQ $0x04, X11
	MOVAPS X9, X9
	PAND X7, X9
	PAND X7, X11
	MOVUPS X0, X12
	MOVUPS X1, X13
	PSHUFB X9, X12
	PSHUFB X9, X13
	MOVUPS X2, X9
	MOVUPS X3, X14
	PSHUFB X11, X9
	PSHUFB X11, X14
	PXOR X9, X12
	PXOR X14, X13
	MOVAPS X10, X9
	MOVAPS X10, X10
	PAND X7, X9
	PSRLQ $0x04, X10
	PAND X7, X10
	MOVUPS X4, X11
	MOVUPS X5, X14
	PSHUFB X9, X11
	PSHUFB X9, X14
	PXOR X11, X12
	PXOR X14, X13
	MOVUPS 48(AX), X11
	MOVUPS 112(AX), X14
	PSHUFB X10, X11
	PSHUFB X10, X14
	PXOR X11, X12
	PXOR X14, X13
	PXOR X12, X6
	PXOR X13, X8
	MOVUPS X6, 16(DX)
	MOVUPS X8, 48(DX)
	ADDQ $0x40, DX
	ADDQ $0x40, BX
	SUBQ $0x40, CX
	JNZ loop
	RET

// func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
// Requires: SSE, SSE2, SSSE3
TEXT ·fftDIT2_ssse3(SB), NOSPLIT, $0-56
	MOVQ table+48(FP), AX
	MOVUPS (AX), X0
	MOVUPS 64(AX), X1
	MOVUPS 16(AX), X2
	MOVUPS 80(AX), X3
	MOVUPS 32(AX), X4
	MOVUPS 96(AX), X5
	XORPS X6, X6
	MOVQ $0x0000000f, CX
	MOVQ CX, X7
	PSHUFB X6, X7
	MOVQ x_len+8(FP), CX
	MOVQ x_base+0(FP), DX
	MOVQ y_base+24(FP), BX

loop:
	MOVUPS (BX), X9
	MOVUPS 32(BX), X10
	MOVAPS X9, X8
	PSRLQ $0x04, X8
	MOVAPS X9, X6
	PAND X7, X6
	PAND X7, X8
	MOVUPS X0, X11
	MOVUPS X1, X12
	PSHUFB X6, X11
	PSHUFB X6, X12
	MOVUPS X2, X6
	MOVUPS X3, X13
	PSHUFB X8, X6
	PSHUFB X8, X13
	PXOR X6, X11
	PXOR X13, X12
	MOVAPS X10, X6
	MOVAPS X10, X8
	PAND X7, X6
	PSRLQ $0x04, X8
	PAND X7, X8
	MOVUPS X4, X13
	MOVUPS X5, X14
	PSHUFB X6, X13
	PSHUFB X6, X14
	PXOR X13, X11
	PXOR X14, X12
	MOVUPS 48(AX), X13
	MOVUPS 112(AX), X14
	PSHUFB X8, X13
	PSHUFB X8, X14
	PXOR X13, X11
	PXOR X14, X12
	MOVUPS (DX), X6
	MOVUPS 32(DX), X8
	PXOR X11, X6
	PXOR X12, X8
	MOVUPS X6, (DX)
	MOVUPS X8, 32(DX)
	PXOR X6, X9
	PXOR X8, X10
	MOVUPS X9, (BX)
	MOVUPS X10, 32(BX)
	MOVUPS 16(BX), X9
	MOVUPS 48(BX), X10
	MOVAPS X9, X8
	PSRLQ $0x04, X8
	MOVAPS X9, X6
	PAND X7, X6
	PAND X7, X8
	MOVUPS X0, X11
	MOVUPS X1, X12
	PSHUFB X6, X11
	PSHUFB X6, X12
	MOVUPS X2, X6
	MOVUPS X3, X13
	PSHUFB X8, X6
	PSHUFB X8, X13
	PXOR X6, X11
	PXOR X13, X12
	MOVAPS X10, X6
	MOVAPS X10, X8
	PAND X7, X6
	PSRLQ $0x04, X8
	PAND X7, X8
	MOVUPS X4, X13
	MOVUPS X5, X14
	PSHUFB X6, X13
	PSHUFB X6, X14
	PXOR X13, X11
	PXOR X14, X12
	MOVUPS 48(AX), X13
	MOVUPS 112(AX), X14
	PSHUFB X8, X13
	PSHUFB X8, X14
	PXOR X13, X11
	PXOR X14, X12
	MOVUPS 16(DX), X6
	MOVUPS 48(DX), X8
	PXOR X11, X6
	PXOR X12, X8
	MOVUPS X6, 16(DX)
	MOVUPS X8, 48(DX)
	PXOR X6, X9
	PXOR X8, X10
	MOVUPS X9, 16(BX)
	MOVUPS X10, 48(BX)
	ADDQ $0x40, DX
	ADDQ $0x40, BX
	SUBQ $0x40, CX
	JNZ loop
	RET
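// The two SSSE3 kernels above process GF(2^16) symbols 64 bytes at a
// time, with the two bytes of each symbol kept in separate 32-byte
// planes. The 128-byte table is eight 16-entry nibble LUTs: four feed
// one output plane and four the other. A per-symbol sketch of the
// multiply-accumulate both loops perform (a sketch only; which plane
// holds the low byte is an assumption here):
//
//	func mulAdd16Sketch(outLo, outHi, lo, hi []byte, t *[128]uint8) {
//		for i := range lo {
//			l, h := lo[i], hi[i]
//			outLo[i] ^= t[l&15] ^ t[16+(l>>4)] ^ t[32+(h&15)] ^ t[48+(h>>4)]
//			outHi[i] ^= t[64+(l&15)] ^ t[80+(l>>4)] ^ t[96+(h&15)] ^ t[112+(h>>4)]
//		}
//	}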
// func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8)
// Requires: SSE, SSE2, SSSE3
TEXT ·mulgf16_ssse3(SB), NOSPLIT, $0-56
	MOVQ table+48(FP), AX
	MOVUPS (AX), X0
	MOVUPS 64(AX), X1
	MOVUPS 16(AX), X2
	MOVUPS 80(AX), X3
	MOVUPS 32(AX), X4
	MOVUPS 96(AX), X5
	MOVUPS 48(AX), X6
	MOVUPS 112(AX), X7
	MOVQ x_len+8(FP), AX
	MOVQ x_base+0(FP), CX
	MOVQ y_base+24(FP), DX
	XORPS X8, X8
	MOVQ $0x0000000f, BX
	MOVQ BX, X9
	PSHUFB X8, X9

loop:
	MOVUPS (DX), X8
	MOVUPS 32(DX), X10
	MOVAPS X8, X11
	PSRLQ $0x04, X11
	MOVAPS X8, X8
	PAND X9, X8
	PAND X9, X11
	MOVUPS X0, X12
	MOVUPS X1, X13
	PSHUFB X8, X12
	PSHUFB X8, X13
	MOVUPS X2, X8
	MOVUPS X3, X14
	PSHUFB X11, X8
	PSHUFB X11, X14
	PXOR X8, X12
	PXOR X14, X13
	MOVAPS X10, X8
	MOVAPS X10, X10
	PAND X9, X8
	PSRLQ $0x04, X10
	PAND X9, X10
	MOVUPS X4, X11
	MOVUPS X5, X14
	PSHUFB X8, X11
	PSHUFB X8, X14
	PXOR X11, X12
	PXOR X14, X13
	MOVUPS X6, X11
	MOVUPS X7, X14
	PSHUFB X10, X11
	PSHUFB X10, X14
	PXOR X11, X12
	PXOR X14, X13
	MOVUPS X12, (CX)
	MOVUPS X13, 32(CX)
	MOVUPS 16(DX), X8
	MOVUPS 48(DX), X10
	MOVAPS X8, X11
	PSRLQ $0x04, X11
	MOVAPS X8, X8
	PAND X9, X8
	PAND X9, X11
	MOVUPS X0, X12
	MOVUPS X1, X13
	PSHUFB X8, X12
	PSHUFB X8, X13
	MOVUPS X2, X8
	MOVUPS X3, X14
	PSHUFB X11, X8
	PSHUFB X11, X14
	PXOR X8, X12
	PXOR X14, X13
	MOVAPS X10, X8
	MOVAPS X10, X10
	PAND X9, X8
	PSRLQ $0x04, X10
	PAND X9, X10
	MOVUPS X4, X11
	MOVUPS X5, X14
	PSHUFB X8, X11
	PSHUFB X8, X14
	PXOR X11, X12
	PXOR X14, X13
	MOVUPS X6, X11
	MOVUPS X7, X14
	PSHUFB X10, X11
	PSHUFB X10, X14
	PXOR X11, X12
	PXOR X14, X13
	MOVUPS X12, 16(CX)
	MOVUPS X13, 48(CX)
	ADDQ $0x40, CX
	ADDQ $0x40, DX
	SUBQ $0x40, AX
	JNZ loop
	RET

// func ifftDIT28_avx2(x []byte, y []byte, table *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT28_avx2(SB), NOSPLIT, $0-56
	MOVQ table+48(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ x_len+8(FP), AX
	MOVQ x_base+0(FP), CX
	MOVQ y_base+24(FP), DX
	MOVQ $0x0000000f, BX
	MOVQ BX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (CX), Y3
	VMOVDQU 32(CX), Y4
	VMOVDQU (DX), Y5
	VMOVDQU 32(DX), Y6
	VPXOR Y5, Y3, Y5
	VPXOR Y6, Y4, Y6
	VMOVDQU Y5, (DX)
	VMOVDQU Y6, 32(DX)

	// LEO_MULADD_256
	VPAND Y5, Y2, Y7
	VPSRLQ $0x04, Y5, Y5
	VPSHUFB Y7, Y0, Y7
	VPAND Y5, Y2, Y5
	VPSHUFB Y5, Y1, Y5
	XOR3WAY( $0x00, Y7, Y5, Y3)

	// LEO_MULADD_256
	VPAND Y6, Y2, Y5
	VPSRLQ $0x04, Y6, Y6
	VPSHUFB Y5, Y0, Y5
	VPAND Y6, Y2, Y6
	VPSHUFB Y6, Y1, Y6
	XOR3WAY( $0x00, Y5, Y6, Y4)
	VMOVDQU Y3, (CX)
	VMOVDQU Y4, 32(CX)
	ADDQ $0x40, CX
	ADDQ $0x40, DX
	SUBQ $0x40, AX
	JA loop
	VZEROUPPER
	RET

// func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT28_avx2(SB), NOSPLIT, $0-56
	MOVQ table+48(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ x_len+8(FP), AX
	MOVQ x_base+0(FP), CX
	MOVQ y_base+24(FP), DX
	MOVQ $0x0000000f, BX
	MOVQ BX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (CX), Y3
	VMOVDQU 32(CX), Y4
	VMOVDQU (DX), Y5
	VMOVDQU 32(DX), Y6

	// LEO_MULADD_256
	VPAND Y5, Y2, Y7
	VPSRLQ $0x04, Y5, Y8
	VPSHUFB Y7, Y0, Y7
	VPAND Y8, Y2, Y8
	VPSHUFB Y8, Y1, Y8
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// LEO_MULADD_256
	VPAND Y6, Y2, Y7
	VPSRLQ $0x04, Y6, Y8
	VPSHUFB Y7, Y0, Y7
	VPAND Y8, Y2, Y8
	VPSHUFB Y8, Y1, Y8
	XOR3WAY( $0x00, Y7, Y8, Y4)
	VMOVDQU Y3, (CX)
	VMOVDQU Y4, 32(CX)
	VPXOR Y5, Y3, Y5
	VPXOR Y6, Y4, Y6
	VMOVDQU Y5, (DX)
	VMOVDQU Y6, 32(DX)
	ADDQ $0x40, CX
	ADDQ $0x40, DX
	SUBQ $0x40, AX
	JA loop
	VZEROUPPER
	RET
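// The DIT48 kernels below work on GF(2^8) symbols, so each multiply only
// needs a 32-byte table: one 16-entry LUT for the low nibble and one for
// the high nibble. Every "LEO_MULADD_256" block is one vectorized
// instance of the following per-byte operation (a minimal sketch;
// leoMulAdd is a hypothetical name):
//
//	func leoMulAdd(dst, src []byte, t *[32]uint8) {
//		for i, v := range src {
//			dst[i] ^= t[v&15] ^ t[16+(v>>4)]
//		}
//	}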
// func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_0(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 16(AX), Y0
	MOVQ t23+40(FP), CX
	VBROADCASTI128 (CX), Y1
	VBROADCASTI128 16(CX), Y2
	MOVQ t02+48(FP), CX
	VBROADCASTI128 (CX), Y3
	VBROADCASTI128 16(CX), Y4
	MOVQ dist+24(FP), CX
	MOVQ work_base+0(FP), DX
	MOVQ 8(DX), BX
	XORQ SI, SI
	MOVQ (DX)(SI*1), DI
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R8
	ADDQ CX, SI
	MOVQ (DX)(SI*1), R9
	ADDQ CX, SI
	MOVQ (DX)(SI*1), CX
	MOVQ $0x0000000f, DX
	MOVQ DX, X5
	VPBROADCASTB X5, Y5

loop:
	VMOVDQU (DI), Y6
	VMOVDQU (R8), Y7
	VMOVDQU 32(DI), Y8
	VMOVDQU 32(R8), Y9
	VPXOR Y7, Y6, Y7
	VPXOR Y9, Y8, Y9
	VBROADCASTI128 (AX), Y10

	// LEO_MULADD_256
	VPAND Y7, Y5, Y11
	VPSRLQ $0x04, Y7, Y12
	VPSHUFB Y11, Y10, Y11
	VPAND Y12, Y5, Y12
	VPSHUFB Y12, Y0, Y12
	XOR3WAY( $0x00, Y11, Y12, Y6)

	// LEO_MULADD_256
	VPAND Y9, Y5, Y11
	VPSRLQ $0x04, Y9, Y12
	VPSHUFB Y11, Y10, Y11
	VPAND Y12, Y5, Y12
	VPSHUFB Y12, Y0, Y12
	XOR3WAY( $0x00, Y11, Y12, Y8)
	VMOVDQU (R9), Y10
	VMOVDQU (CX), Y11
	VMOVDQU 32(R9), Y12
	VMOVDQU 32(CX), Y13
	VPXOR Y10, Y11, Y11
	VPXOR Y12, Y13, Y13

	// LEO_MULADD_256
	VPAND Y11, Y5, Y14
	VPSRLQ $0x04, Y11, Y15
	VPSHUFB Y14, Y1, Y14
	VPAND Y15, Y5, Y15
	VPSHUFB Y15, Y2, Y15
	XOR3WAY( $0x00, Y14, Y15, Y10)

	// LEO_MULADD_256
	VPAND Y13, Y5, Y14
	VPSRLQ $0x04, Y13, Y15
	VPSHUFB Y14, Y1, Y14
	VPAND Y15, Y5, Y15
	VPSHUFB Y15, Y2, Y15
	XOR3WAY( $0x00, Y14, Y15, Y12)
	VPXOR Y6, Y10, Y10
	VPXOR Y7, Y11, Y11
	VPXOR Y8, Y12, Y12
	VPXOR Y9, Y13, Y13

	// LEO_MULADD_256
	VPAND Y10, Y5, Y14
	VPSRLQ $0x04, Y10, Y15
	VPSHUFB Y14, Y3, Y14
	VPAND Y15, Y5, Y15
	VPSHUFB Y15, Y4, Y15
	XOR3WAY( $0x00, Y14, Y15, Y6)

	// LEO_MULADD_256
	VPAND Y11, Y5, Y14
	VPSRLQ $0x04, Y11, Y15
	VPSHUFB Y14, Y3, Y14
	VPAND Y15, Y5, Y15
	VPSHUFB Y15, Y4, Y15
	XOR3WAY( $0x00, Y14, Y15, Y7)

	// LEO_MULADD_256
	VPAND Y12, Y5, Y14
	VPSRLQ $0x04, Y12, Y15
	VPSHUFB Y14, Y3, Y14
	VPAND Y15, Y5, Y15
	VPSHUFB Y15, Y4, Y15
	XOR3WAY( $0x00, Y14, Y15, Y8)

	// LEO_MULADD_256
	VPAND Y13, Y5, Y14
	VPSRLQ $0x04, Y13, Y15
	VPSHUFB Y14, Y3, Y14
	VPAND Y15, Y5, Y15
	VPSHUFB Y15, Y4, Y15
	XOR3WAY( $0x00, Y14, Y15, Y9)
	VMOVDQU Y6, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y9, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y10, (R9)
	VMOVDQU Y12, 32(R9)
	ADDQ $0x40, R9
	VMOVDQU Y11, (CX)
	VMOVDQU Y13, 32(CX)
	ADDQ $0x40, CX
	SUBQ $0x40, BX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_0(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 16(AX), Y0
	MOVQ t23+40(FP), CX
	VBROADCASTI128 16(CX), Y1
	MOVQ t02+48(FP), DX
	VBROADCASTI128 (DX), Y2
	VBROADCASTI128 16(DX), Y3
	MOVQ dist+24(FP), DX
	MOVQ work_base+0(FP), BX
	MOVQ 8(BX), SI
	XORQ DI, DI
	MOVQ (BX)(DI*1), R8
	ADDQ DX, DI
	MOVQ (BX)(DI*1), R9
	ADDQ DX, DI
	MOVQ (BX)(DI*1), R10
	ADDQ DX, DI
	MOVQ (BX)(DI*1), DX
	MOVQ $0x0000000f, BX
	MOVQ BX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (R8), Y5
	VMOVDQU 32(R8), Y6
	VMOVDQU (R10), Y9
	VMOVDQU 32(R10), Y10
	VMOVDQU (R9), Y7
	VMOVDQU 32(R9), Y8
	VMOVDQU (DX), Y11
	VMOVDQU 32(DX), Y12

	// LEO_MULADD_256
	VPAND Y9, Y4, Y13
	VPSRLQ $0x04, Y9, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)

	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y7)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y8)
	VPXOR Y5, Y9, Y9
	VPXOR Y7, Y11, Y11
	VPXOR Y6, Y10, Y10
	VPXOR Y8, Y12, Y12
	VBROADCASTI128 (AX), Y13

	// LEO_MULADD_256
	VPAND Y7, Y4, Y14
	VPSRLQ $0x04, Y7, Y15
	VPSHUFB Y14, Y13, Y14
	VPAND Y15, Y4, Y15
	VPSHUFB Y15, Y0, Y15
	XOR3WAY( $0x00, Y14, Y15, Y5)

	// LEO_MULADD_256
	VPAND Y8, Y4, Y14
	VPSRLQ $0x04, Y8, Y15
	VPSHUFB Y14, Y13, Y14
	VPAND Y15, Y4, Y15
	VPSHUFB Y15, Y0, Y15
	XOR3WAY( $0x00, Y14, Y15, Y6)
	VPXOR Y7, Y5, Y7
	VPXOR Y8, Y6, Y8
	VBROADCASTI128 (CX), Y13

	// LEO_MULADD_256
	VPAND Y11, Y4, Y14
	VPSRLQ $0x04, Y11, Y15
	VPSHUFB Y14, Y13, Y14
	VPAND Y15, Y4, Y15
	VPSHUFB Y15, Y1, Y15
	XOR3WAY( $0x00, Y14, Y15, Y9)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y14
	VPSRLQ $0x04, Y12, Y15
	VPSHUFB Y14, Y13, Y14
	VPAND Y15, Y4, Y15
	VPSHUFB Y15, Y1, Y15
	XOR3WAY( $0x00, Y14, Y15, Y10)
	VPXOR Y9, Y11, Y11
	VPXOR Y10, Y12, Y12
	VMOVDQU Y5, (R8)
	VMOVDQU Y6, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y7, (R9)
	VMOVDQU Y8, 32(R9)
	ADDQ $0x40, R9
	VMOVDQU Y9, (R10)
	VMOVDQU Y10, 32(R10)
	ADDQ $0x40, R10
	VMOVDQU Y11, (DX)
	VMOVDQU Y12, 32(DX)
	ADDQ $0x40, DX
	SUBQ $0x40, SI
	JA loop
	VZEROUPPER
	RET
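// As with the DIT4 kernels, the suffix on ifftDIT48_avx2_N/fftDIT48_avx2_N
// is a bitmask of skipped multiplies, numbered in the order the three
// table multiplies are applied: _0 performs all of them and _7 none.
// This is consistent with the tables each variant below actually loads
// (e.g. ifftDIT48_avx2_1 never touches t01, fftDIT48_avx2_1 never
// touches t02).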
// func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_1(SB), NOSPLIT, $0-56
	MOVQ t23+40(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ t02+48(FP), AX
	VBROADCASTI128 (AX), Y2
	VBROADCASTI128 16(AX), Y3
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU 32(SI), Y7
	VMOVDQU 32(DI), Y8
	VPXOR Y6, Y5, Y6
	VPXOR Y8, Y7, Y8
	VMOVDQU (R8), Y9
	VMOVDQU (AX), Y10
	VMOVDQU 32(R8), Y11
	VMOVDQU 32(AX), Y12
	VPXOR Y9, Y10, Y10
	VPXOR Y11, Y12, Y12

	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y9)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y11)
	VPXOR Y5, Y9, Y9
	VPXOR Y6, Y10, Y10
	VPXOR Y7, Y11, Y11
	VPXOR Y8, Y12, Y12

	// LEO_MULADD_256
	VPAND Y9, Y4, Y13
	VPSRLQ $0x04, Y9, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)

	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y7)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y8)
	VMOVDQU Y5, (SI)
	VMOVDQU Y7, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y6, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y9, (R8)
	VMOVDQU Y11, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y10, (AX)
	VMOVDQU Y12, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ t23+40(FP), AX
	VBROADCASTI128 (AX), Y2
	VBROADCASTI128 16(AX), Y3
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (SI), Y5
	VMOVDQU 32(SI), Y6
	VMOVDQU (R8), Y9
	VMOVDQU 32(R8), Y10
	VMOVDQU (DI), Y7
	VMOVDQU 32(DI), Y8
	VMOVDQU (AX), Y11
	VMOVDQU 32(AX), Y12
	VPXOR Y5, Y9, Y9
	VPXOR Y7, Y11, Y11
	VPXOR Y6, Y10, Y10
	VPXOR Y8, Y12, Y12

	// LEO_MULADD_256
	VPAND Y7, Y4, Y13
	VPSRLQ $0x04, Y7, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)

	// LEO_MULADD_256
	VPAND Y8, Y4, Y13
	VPSRLQ $0x04, Y8, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)
	VPXOR Y7, Y5, Y7
	VPXOR Y8, Y6, Y8

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y9)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y10)
	VPXOR Y9, Y11, Y11
	VPXOR Y10, Y12, Y12
	VMOVDQU Y5, (SI)
	VMOVDQU Y6, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y7, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y9, (R8)
	VMOVDQU Y10, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y11, (AX)
	VMOVDQU Y12, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ t02+48(FP), AX
	VBROADCASTI128 (AX), Y2
	VBROADCASTI128 16(AX), Y3
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU 32(SI), Y7
	VMOVDQU 32(DI), Y8
	VPXOR Y6, Y5, Y6
	VPXOR Y8, Y7, Y8

	// LEO_MULADD_256
	VPAND Y6, Y4, Y9
	VPSRLQ $0x04, Y6, Y10
	VPSHUFB Y9, Y0, Y9
	VPAND Y10, Y4, Y10
	VPSHUFB Y10, Y1, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// LEO_MULADD_256
	VPAND Y8, Y4, Y9
	VPSRLQ $0x04, Y8, Y10
	VPSHUFB Y9, Y0, Y9
	VPAND Y10, Y4, Y10
	VPSHUFB Y10, Y1, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)
	VMOVDQU (R8), Y9
	VMOVDQU (AX), Y10
	VMOVDQU 32(R8), Y11
	VMOVDQU 32(AX), Y12
	VPXOR Y9, Y10, Y10
	VPXOR Y11, Y12, Y12
	VPXOR Y5, Y9, Y9
	VPXOR Y6, Y10, Y10
	VPXOR Y7, Y11, Y11
	VPXOR Y8, Y12, Y12

	// LEO_MULADD_256
	VPAND Y9, Y4, Y13
	VPSRLQ $0x04, Y9, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)

	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y7)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y8)
	VMOVDQU Y5, (SI)
	VMOVDQU Y7, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y6, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y9, (R8)
	VMOVDQU Y11, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y10, (AX)
	VMOVDQU Y12, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56
	MOVQ t23+40(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ t02+48(FP), AX
	VBROADCASTI128 (AX), Y2
	VBROADCASTI128 16(AX), Y3
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (SI), Y5
	VMOVDQU 32(SI), Y6
	VMOVDQU (R8), Y9
	VMOVDQU 32(R8), Y10
	VMOVDQU (DI), Y7
	VMOVDQU 32(DI), Y8
	VMOVDQU (AX), Y11
	VMOVDQU 32(AX), Y12

	// LEO_MULADD_256
	VPAND Y9, Y4, Y13
	VPSRLQ $0x04, Y9, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)

	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y7)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y8)
	VPXOR Y5, Y9, Y9
	VPXOR Y7, Y11, Y11
	VPXOR Y6, Y10, Y10
	VPXOR Y8, Y12, Y12
	VPXOR Y7, Y5, Y7
	VPXOR Y8, Y6, Y8

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y9)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y10)
	VPXOR Y9, Y11, Y11
	VPXOR Y10, Y12, Y12
	VMOVDQU Y5, (SI)
	VMOVDQU Y6, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y7, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y9, (R8)
	VMOVDQU Y10, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y11, (AX)
	VMOVDQU Y12, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_3(SB), NOSPLIT, $0-56
	MOVQ t02+48(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (SI), Y3
	VMOVDQU (DI), Y4
	VMOVDQU 32(SI), Y5
	VMOVDQU 32(DI), Y6
	VPXOR Y4, Y3, Y4
	VPXOR Y6, Y5, Y6
	VMOVDQU (R8), Y7
	VMOVDQU (AX), Y8
	VMOVDQU 32(R8), Y9
	VMOVDQU 32(AX), Y10
	VPXOR Y7, Y8, Y8
	VPXOR Y9, Y10, Y10
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VPXOR Y5, Y9, Y9
	VPXOR Y6, Y10, Y10

	// LEO_MULADD_256
	VPAND Y7, Y2, Y11
	VPSRLQ $0x04, Y7, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y3)

	// LEO_MULADD_256
	VPAND Y8, Y2, Y11
	VPSRLQ $0x04, Y8, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y4)

	// LEO_MULADD_256
	VPAND Y9, Y2, Y11
	VPSRLQ $0x04, Y9, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y5)

	// LEO_MULADD_256
	VPAND Y10, Y2, Y11
	VPSRLQ $0x04, Y10, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y6)
	VMOVDQU Y3, (SI)
	VMOVDQU Y5, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y4, (DI)
	VMOVDQU Y6, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y9, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y8, (AX)
	VMOVDQU Y10, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_3(SB), NOSPLIT, $0-56
	MOVQ t23+40(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (SI), Y3
	VMOVDQU 32(SI), Y4
	VMOVDQU (R8), Y7
	VMOVDQU 32(R8), Y8
	VMOVDQU (DI), Y5
	VMOVDQU 32(DI), Y6
	VMOVDQU (AX), Y9
	VMOVDQU 32(AX), Y10
	VPXOR Y3, Y7, Y7
	VPXOR Y5, Y9, Y9
	VPXOR Y4, Y8, Y8
	VPXOR Y6, Y10, Y10
	VPXOR Y5, Y3, Y5
	VPXOR Y6, Y4, Y6

	// LEO_MULADD_256
	VPAND Y9, Y2, Y11
	VPSRLQ $0x04, Y9, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y7)

	// LEO_MULADD_256
	VPAND Y10, Y2, Y11
	VPSRLQ $0x04, Y10, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y8)
	VPXOR Y7, Y9, Y9
	VPXOR Y8, Y10, Y10
	VMOVDQU Y3, (SI)
	VMOVDQU Y4, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y5, (DI)
	VMOVDQU Y6, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y8, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y9, (AX)
	VMOVDQU Y10, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ t23+40(FP), AX
	VBROADCASTI128 (AX), Y2
	VBROADCASTI128 16(AX), Y3
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (SI), Y5
	VMOVDQU (DI), Y6
	VMOVDQU 32(SI), Y7
	VMOVDQU 32(DI), Y8
	VPXOR Y6, Y5, Y6
	VPXOR Y8, Y7, Y8

	// LEO_MULADD_256
	VPAND Y6, Y4, Y9
	VPSRLQ $0x04, Y6, Y10
	VPSHUFB Y9, Y0, Y9
	VPAND Y10, Y4, Y10
	VPSHUFB Y10, Y1, Y10
	XOR3WAY( $0x00, Y9, Y10, Y5)

	// LEO_MULADD_256
	VPAND Y8, Y4, Y9
	VPSRLQ $0x04, Y8, Y10
	VPSHUFB Y9, Y0, Y9
	VPAND Y10, Y4, Y10
	VPSHUFB Y10, Y1, Y10
	XOR3WAY( $0x00, Y9, Y10, Y7)
	VMOVDQU (R8), Y9
	VMOVDQU (AX), Y10
	VMOVDQU 32(R8), Y11
	VMOVDQU 32(AX), Y12
	VPXOR Y9, Y10, Y10
	VPXOR Y11, Y12, Y12

	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y9)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y11)
	VPXOR Y5, Y9, Y9
	VPXOR Y6, Y10, Y10
	VPXOR Y7, Y11, Y11
	VPXOR Y8, Y12, Y12
	VMOVDQU Y5, (SI)
	VMOVDQU Y7, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y6, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y9, (R8)
	VMOVDQU Y11, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y10, (AX)
	VMOVDQU Y12, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_4(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ t02+48(FP), AX
	VBROADCASTI128 (AX), Y2
	VBROADCASTI128 16(AX), Y3
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X4
	VPBROADCASTB X4, Y4

loop:
	VMOVDQU (SI), Y5
	VMOVDQU 32(SI), Y6
	VMOVDQU (R8), Y9
	VMOVDQU 32(R8), Y10
	VMOVDQU (DI), Y7
	VMOVDQU 32(DI), Y8
	VMOVDQU (AX), Y11
	VMOVDQU 32(AX), Y12

	// LEO_MULADD_256
	VPAND Y9, Y4, Y13
	VPSRLQ $0x04, Y9, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)
	// LEO_MULADD_256
	VPAND Y10, Y4, Y13
	VPSRLQ $0x04, Y10, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)

	// LEO_MULADD_256
	VPAND Y11, Y4, Y13
	VPSRLQ $0x04, Y11, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y7)

	// LEO_MULADD_256
	VPAND Y12, Y4, Y13
	VPSRLQ $0x04, Y12, Y14
	VPSHUFB Y13, Y2, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y3, Y14
	XOR3WAY( $0x00, Y13, Y14, Y8)
	VPXOR Y5, Y9, Y9
	VPXOR Y7, Y11, Y11
	VPXOR Y6, Y10, Y10
	VPXOR Y8, Y12, Y12

	// LEO_MULADD_256
	VPAND Y7, Y4, Y13
	VPSRLQ $0x04, Y7, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y5)

	// LEO_MULADD_256
	VPAND Y8, Y4, Y13
	VPSRLQ $0x04, Y8, Y14
	VPSHUFB Y13, Y0, Y13
	VPAND Y14, Y4, Y14
	VPSHUFB Y14, Y1, Y14
	XOR3WAY( $0x00, Y13, Y14, Y6)
	VPXOR Y7, Y5, Y7
	VPXOR Y8, Y6, Y8
	VPXOR Y9, Y11, Y11
	VPXOR Y10, Y12, Y12
	VMOVDQU Y5, (SI)
	VMOVDQU Y6, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y7, (DI)
	VMOVDQU Y8, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y9, (R8)
	VMOVDQU Y10, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y11, (AX)
	VMOVDQU Y12, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56
	MOVQ t23+40(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (SI), Y3
	VMOVDQU (DI), Y4
	VMOVDQU 32(SI), Y5
	VMOVDQU 32(DI), Y6
	VPXOR Y4, Y3, Y4
	VPXOR Y6, Y5, Y6
	VMOVDQU (R8), Y7
	VMOVDQU (AX), Y8
	VMOVDQU 32(R8), Y9
	VMOVDQU 32(AX), Y10
	VPXOR Y7, Y8, Y8
	VPXOR Y9, Y10, Y10

	// LEO_MULADD_256
	VPAND Y8, Y2, Y11
	VPSRLQ $0x04, Y8, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y7)

	// LEO_MULADD_256
	VPAND Y10, Y2, Y11
	VPSRLQ $0x04, Y10, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y9)
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VPXOR Y5, Y9, Y9
	VPXOR Y6, Y10, Y10
	VMOVDQU Y3, (SI)
	VMOVDQU Y5, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y4, (DI)
	VMOVDQU Y6, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y9, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y8, (AX)
	VMOVDQU Y10, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (SI), Y3
	VMOVDQU 32(SI), Y4
	VMOVDQU (R8), Y7
	VMOVDQU 32(R8), Y8
	VMOVDQU (DI), Y5
	VMOVDQU 32(DI), Y6
	VMOVDQU (AX), Y9
	VMOVDQU 32(AX), Y10
	VPXOR Y3, Y7, Y7
	VPXOR Y5, Y9, Y9
	VPXOR Y4, Y8, Y8
	VPXOR Y6, Y10, Y10

	// LEO_MULADD_256
	VPAND Y5, Y2, Y11
	VPSRLQ $0x04, Y5, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y3)

	// LEO_MULADD_256
	VPAND Y6, Y2, Y11
	VPSRLQ $0x04, Y6, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y4)
	VPXOR Y5, Y3, Y5
	VPXOR Y6, Y4, Y6
	VPXOR Y7, Y9, Y9
	VPXOR Y8, Y10, Y10
	VMOVDQU Y3, (SI)
	VMOVDQU Y4, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y5, (DI)
	VMOVDQU Y6, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y8, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y9, (AX)
	VMOVDQU Y10, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·ifftDIT48_avx2_6(SB), NOSPLIT, $0-56
	MOVQ t01+32(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (SI), Y3
	VMOVDQU (DI), Y4
	VMOVDQU 32(SI), Y5
	VMOVDQU 32(DI), Y6
	VPXOR Y4, Y3, Y4
	VPXOR Y6, Y5, Y6

	// LEO_MULADD_256
	VPAND Y4, Y2, Y7
	VPSRLQ $0x04, Y4, Y8
	VPSHUFB Y7, Y0, Y7
	VPAND Y8, Y2, Y8
	VPSHUFB Y8, Y1, Y8
	XOR3WAY( $0x00, Y7, Y8, Y3)

	// LEO_MULADD_256
	VPAND Y6, Y2, Y7
	VPSRLQ $0x04, Y6, Y8
	VPSHUFB Y7, Y0, Y7
	VPAND Y8, Y2, Y8
	VPSHUFB Y8, Y1, Y8
	XOR3WAY( $0x00, Y7, Y8, Y5)
	VMOVDQU (R8), Y7
	VMOVDQU (AX), Y8
	VMOVDQU 32(R8), Y9
	VMOVDQU 32(AX), Y10
	VPXOR Y7, Y8, Y8
	VPXOR Y9, Y10, Y10
	VPXOR Y3, Y7, Y7
	VPXOR Y4, Y8, Y8
	VPXOR Y5, Y9, Y9
	VPXOR Y6, Y10, Y10
	VMOVDQU Y3, (SI)
	VMOVDQU Y5, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y4, (DI)
	VMOVDQU Y6, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y9, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y8, (AX)
	VMOVDQU Y10, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
TEXT ·fftDIT48_avx2_6(SB), NOSPLIT, $0-56
	MOVQ t02+48(FP), AX
	VBROADCASTI128 (AX), Y0
	VBROADCASTI128 16(AX), Y1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X2
	VPBROADCASTB X2, Y2

loop:
	VMOVDQU (SI), Y3
	VMOVDQU 32(SI), Y4
	VMOVDQU (R8), Y7
	VMOVDQU 32(R8), Y8
	VMOVDQU (DI), Y5
	VMOVDQU 32(DI), Y6
	VMOVDQU (AX), Y9
	VMOVDQU 32(AX), Y10

	// LEO_MULADD_256
	VPAND Y7, Y2, Y11
	VPSRLQ $0x04, Y7, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y3)

	// LEO_MULADD_256
	VPAND Y8, Y2, Y11
	VPSRLQ $0x04, Y8, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y4)

	// LEO_MULADD_256
	VPAND Y9, Y2, Y11
	VPSRLQ $0x04, Y9, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y5)

	// LEO_MULADD_256
	VPAND Y10, Y2, Y11
	VPSRLQ $0x04, Y10, Y12
	VPSHUFB Y11, Y0, Y11
	VPAND Y12, Y2, Y12
	VPSHUFB Y12, Y1, Y12
	XOR3WAY( $0x00, Y11, Y12, Y6)
	VPXOR Y3, Y7, Y7
	VPXOR Y5, Y9, Y9
	VPXOR Y4, Y8, Y8
	VPXOR Y6, Y10, Y10
	VPXOR Y5, Y3, Y5
	VPXOR Y6, Y4, Y6
	VPXOR Y7, Y9, Y9
	VPXOR Y8, Y10, Y10
	VMOVDQU Y3, (SI)
	VMOVDQU Y4, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y5, (DI)
	VMOVDQU Y6, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y7, (R8)
	VMOVDQU Y8, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y9, (AX)
	VMOVDQU Y10, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·ifftDIT48_avx2_7(SB), NOSPLIT, $0-56
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X0
	VPBROADCASTB X0, Y0

loop:
	VMOVDQU (SI), Y0
	VMOVDQU (DI), Y1
	VMOVDQU 32(SI), Y2
	VMOVDQU 32(DI), Y3
	VPXOR Y1, Y0, Y1
	VPXOR Y3, Y2, Y3
	VMOVDQU (R8), Y4
	VMOVDQU (AX), Y5
	VMOVDQU 32(R8), Y6
	VMOVDQU 32(AX), Y7
	VPXOR Y4, Y5, Y5
	VPXOR Y6, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y1, Y5, Y5
	VPXOR Y2, Y6, Y6
	VPXOR Y3, Y7, Y7
	VMOVDQU Y0, (SI)
	VMOVDQU Y2, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y1, (DI)
	VMOVDQU Y3, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y4, (R8)
	VMOVDQU Y6, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y5, (AX)
	VMOVDQU Y7, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8)
// Requires: AVX, AVX2, SSE2
TEXT ·fftDIT48_avx2_7(SB), NOSPLIT, $0-56
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX
	MOVQ $0x0000000f, CX
	MOVQ CX, X0
	VPBROADCASTB X0, Y0

loop:
	VMOVDQU (SI), Y0
	VMOVDQU 32(SI), Y1
	VMOVDQU (R8), Y4
	VMOVDQU 32(R8), Y5
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y3
	VMOVDQU (AX), Y6
	VMOVDQU 32(AX), Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y2, Y6, Y6
	VPXOR Y1, Y5, Y5
	VPXOR Y3, Y7, Y7
	VPXOR Y2, Y0, Y2
	VPXOR Y3, Y1, Y3
	VPXOR Y4, Y6, Y6
	VPXOR Y5, Y7, Y7
	VMOVDQU Y0, (SI)
	VMOVDQU Y1, 32(SI)
	ADDQ $0x40, SI
	VMOVDQU Y2, (DI)
	VMOVDQU Y3, 32(DI)
	ADDQ $0x40, DI
	VMOVDQU Y4, (R8)
	VMOVDQU Y5, 32(R8)
	ADDQ $0x40, R8
	VMOVDQU Y6, (AX)
	VMOVDQU Y7, 32(AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET
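// The *_gfni_* variants below replace the nibble-table lookups with
// VGF2P8AFFINEQB. Multiplication by a constant is linear over GF(2), so
// it can be encoded as an 8x8 bit matrix; the t01/t23/t02 uint64
// arguments carry those matrices, VBROADCASTF32X2 replicates them to
// every lane, and a single affine instruction then multiplies 64 bytes
// per ZMM register. A sketch of how such a matrix could be derived from
// a scalar multiply (gfMul below uses the common 0x1D reduction purely
// for illustration; the field and basis used by this package may differ):
//
//	func gfMul(a, b byte) byte {
//		var p byte
//		for b != 0 {
//			if b&1 != 0 {
//				p ^= a
//			}
//			hi := a & 0x80
//			a <<= 1
//			if hi != 0 {
//				a ^= 0x1d
//			}
//			b >>= 1
//		}
//		return p
//	}
//
//	func matrixFor(c byte) (m [8]byte) {
//		for bit := 0; bit < 8; bit++ {
//			m[bit] = gfMul(c, byte(1<<bit)) // image of each basis bit = one matrix column
//		}
//		return m
//	}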
// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1
	VBROADCASTF32X2 t02+48(FP), Z2
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z3
	VMOVDQU64 (DI), Z4
	VMOVDQU64 (R8), Z5
	VMOVDQU64 (AX), Z6
	VXORPD Z4, Z3, Z4

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
	VXORPD Z3, Z7, Z3
	VXORPD Z5, Z6, Z6

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VPTERNLOGD $0x96, Z7, Z3, Z5
	VXORPD Z4, Z6, Z6

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
	VXORPD Z3, Z7, Z3

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD Z4, Z7, Z4
	VMOVDQU64 Z3, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z4, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z5, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z6, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1
	VBROADCASTF32X2 t02+48(FP), Z2
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z3
	VMOVDQU64 (DI), Z4
	VMOVDQU64 (R8), Z5
	VMOVDQU64 (AX), Z6

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z2, Z5, Z7
	VXORPD Z3, Z7, Z3

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z2, Z6, Z7
	VXORPD Z4, Z7, Z4
	VXORPD Z3, Z5, Z5
	VXORPD Z4, Z6, Z6

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z7
	VXORPD Z3, Z7, Z3
	VXORPD Z4, Z3, Z4

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z6, Z7
	VXORPD Z5, Z7, Z5
	VXORPD Z5, Z6, Z6
	VMOVDQU64 Z3, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z4, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z5, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z6, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t23+40(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5
	VXORPD Z3, Z2, Z3
	VXORPD Z4, Z5, Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
	VPTERNLOGD $0x96, Z6, Z2, Z4
	VXORPD Z3, Z5, Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3
	VMOVDQU64 Z2, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z3, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z4, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	VXORPD Z3, Z2, Z3

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z4, Z6, Z4
	VXORPD Z4, Z5, Z5
	VMOVDQU64 Z2, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z3, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z4, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5
	VXORPD Z3, Z2, Z3

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	VXORPD Z4, Z5, Z5
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3
	VMOVDQU64 Z2, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z3, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z4, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t23+40(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5
	VXORPD Z3, Z2, Z3

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z5, Z6
	VXORPD Z4, Z6, Z4
	VXORPD Z4, Z5, Z5
	VMOVDQU64 Z2, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z3, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z4, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t02+48(FP), Z0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
	VXORPD Z1, Z5, Z1

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z2, Z5, Z2
	VMOVDQU64 Z1, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z2, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z3, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t23+40(FP), Z0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	VXORPD Z2, Z1, Z2

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z3, Z5, Z3
	VXORPD Z3, Z4, Z4
	VMOVDQU64 Z1, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z2, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z3, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t23+40(FP), Z1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5
	VXORPD Z3, Z2, Z3

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	VXORPD Z4, Z5, Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VPTERNLOGD $0x96, Z6, Z2, Z4
	VXORPD Z3, Z5, Z5
	VMOVDQU64 Z2, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z3, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z4, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	VBROADCASTF32X2 t02+48(FP), Z1
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z2
	VMOVDQU64 (DI), Z3
	VMOVDQU64 (R8), Z4
	VMOVDQU64 (AX), Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z4, Z6
	VXORPD Z2, Z6, Z2

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z1, Z5, Z6
	VXORPD Z3, Z6, Z3
	VXORPD Z2, Z4, Z4
	VXORPD Z3, Z5, Z5

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z6
	VXORPD Z2, Z6, Z2
	VXORPD Z3, Z2, Z3
	VXORPD Z4, Z5, Z5
	VMOVDQU64 Z2, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z3, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z4, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z5, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t23+40(FP), Z0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VPTERNLOGD $0x96, Z5, Z1, Z3
	VXORPD Z2, Z4, Z4
	VMOVDQU64 Z1, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z2, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z3, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
	VXORPD Z1, Z5, Z1
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4
	VMOVDQU64 Z1, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z2, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z3, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t01+32(FP), Z0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4
	VXORPD Z2, Z1, Z2

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z2, Z5
	VXORPD Z1, Z5, Z1
	VXORPD Z3, Z4, Z4
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	VMOVDQU64 Z1, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z2, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z3, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F, GFNI
TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56
	VBROADCASTF32X2 t02+48(FP), Z0
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z1
	VMOVDQU64 (DI), Z2
	VMOVDQU64 (R8), Z3
	VMOVDQU64 (AX), Z4

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z3, Z5
	VXORPD Z1, Z5, Z1

	// LEO_MULADD_512
	VGF2P8AFFINEQB $0x00, Z0, Z4, Z5
	VXORPD Z2, Z5, Z2
	VXORPD Z1, Z3, Z3
	VXORPD Z2, Z4, Z4
	VXORPD Z2, Z1, Z2
	VXORPD Z3, Z4, Z4
	VMOVDQU64 Z1, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z2, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z3, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z4, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F
TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z0
	VMOVDQU64 (DI), Z1
	VMOVDQU64 (R8), Z2
	VMOVDQU64 (AX), Z3
	VXORPD Z1, Z0, Z1
	VXORPD Z2, Z3, Z3
	VXORPD Z0, Z2, Z2
	VXORPD Z1, Z3, Z3
	VMOVDQU64 Z0, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z1, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z2, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z3, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET

// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64)
// Requires: AVX, AVX512DQ, AVX512F
TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56
	MOVQ dist+24(FP), AX
	MOVQ work_base+0(FP), CX
	MOVQ 8(CX), DX
	XORQ BX, BX
	MOVQ (CX)(BX*1), SI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), DI
	ADDQ AX, BX
	MOVQ (CX)(BX*1), R8
	ADDQ AX, BX
	MOVQ (CX)(BX*1), AX

loop:
	VMOVDQU64 (SI), Z0
	VMOVDQU64 (DI), Z1
	VMOVDQU64 (R8), Z2
	VMOVDQU64 (AX), Z3
	VXORPD Z0, Z2, Z2
	VXORPD Z1, Z3, Z3
	VXORPD Z1, Z0, Z1
	VXORPD Z2, Z3, Z3
	VMOVDQU64 Z0, (SI)
	ADDQ $0x40, SI
	VMOVDQU64 Z1, (DI)
	ADDQ $0x40, DI
	VMOVDQU64 Z2, (R8)
	ADDQ $0x40, R8
	VMOVDQU64 Z3, (AX)
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	JA loop
	VZEROUPPER
	RET